# Semantic Search Application

In [1]:
import sys
sys.path.append('..')
sys.path.append("webserver")
from aips import *
from webserver.semantic_search import *
import webserver.semantic_search.engine.semantic_knowledge_graph as skg
from webserver.semantic_search.engine.text_tagger import *
from webserver.semantic_search.query_tree import *
engine = get_engine()

## Starting the Reviews Search Web Server and Launching the Search Page

In [2]:
def get_running_webservers():
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    already_running_webservers = get_running_webservers()
    for pid in already_running_webservers:
        print("Stopping webserver (pid: " + pid + ")")
        results = ! xargs kill -9 {pid}

def start_reviews_search_webserver():
    stop_running_webservers() #in case it was already running
    get_ipython().system = os.system
    ! cd ../webserver && python start-webserver.py &
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

### Listing 7.2

In [3]:
#Start the web server
start_reviews_search_webserver()

Successfully Started Webserver (pid: 7887)!


### Figure 7.2

In [4]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>

### Figure 7.3

In [5]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+near+charlotte" width=100% height="800"></iframe>

### Figure 7.4

In [6]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+charlotte" width=100% height="800"></iframe>

### Figure 7.5

In [7]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.6

In [8]:
%%html
<iframe src="http://localhost:2345/search?q=top+korean+bbq+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.7

In [9]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+korean+bbq+near+charlotte&submit=true" width=100% height="800"></iframe>

### Listing 7.3

In [None]:
!! cat ../data/reviews/entities.csv

### Listing 7.4 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.4).

### Listing 7.5 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.5).

### Listing 7.6

In [2]:
query = "top kimchi near charlotte"
tagger = TextTagger("entities")
tagger_data = tagger.tag_query(query)
tagger_data

{'responseHeader': {'status': 0, 'QTime': 6},
 'tagsCount': 3,
 'tags': [{'startOffset': 0, 'endOffset': 3, 'matchText': 'top', 'ids': ['7']},
  {'startOffset': 11, 'endOffset': 15, 'matchText': 'near', 'ids': ['1', '5']},
  {'startOffset': 16,
   'endOffset': 25,
   'matchText': 'charlotte',
   'ids': ['4460243', '4612828', '4680560', '4988584', '5234793']}],
 'response': {'numFound': 8,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'id': '1',
    'canonical_form': '{location_distance}',
    'type': 'semantic_function',
    'popularity': 90,
    'semantic_function': 'location_distance(query, position)'},
   {'id': '5',
    'canonical_form': '{text_distance}',
    'type': 'semantic_function',
    'popularity': 10,
    'semantic_function': 'text_distance(query, position)'},
   {'id': '7',
    'canonical_form': '{popular}',
    'type': 'semantic_function',
    'popularity': 100,
    'semantic_function': 'popularity(query, position)'},
   {'id': '4460243',
    'canonical_form': 'Char

### Listing 7.7


In [3]:
def generate_tagged_query(query, tagger_data):
    last_end = 0
    tagged_query = ""
    for tag in tagger_data["tags"]:
        next_text = query[last_end:tag["startOffset"]].strip()
        if len(next_text) > 0:
            tagged_query += " " + next_text
        tagged_query += " {" + tag["matchText"] + "}"
        last_end = tag["endOffset"]
    if last_end < len(query):
        final_text = query[last_end:len(query)].strip()
        if len(final_text):
            tagged_query += " " + final_text
    return tagged_query   

tagged_query = generate_tagged_query(query, tagger_data)
print(tagged_query)

 {top} kimchi {near} {charlotte}


### Listing 7.8

In [4]:
# %load -s generate_query_tree webserver/semantic_search/__init__.py
def generate_query_tree(query, tagger_data):
    query_tree = []    
    last_end = 0
    doc_map =  {}
    for doc in tagger_data["response"]["docs"]:
        doc_map[doc["id"]] = doc
        
    for tag in tagger_data["tags"]:
        best_doc_id = None
        for doc_id in tag["ids"]:
            if best_doc_id:
                if (doc_map[doc_id]["popularity"] > 
                    doc_map[best_doc_id]["popularity"]):
                    best_doc_id = doc_id
            else:
                best_doc_id = doc_id
        best_doc = doc_map[best_doc_id]
        
        next_text = query[last_end:tag["startOffset"]].strip()
        if len(next_text) > 0:
            query_tree.append({
                "type": "keyword", "known": False,
                "surface_form": next_text,
                "canonical_form": next_text})
        query_tree.append(best_doc)
        last_end = tag["endOffset"]

    if last_end < len(query):
        final_text = query[last_end:len(query)].strip()
        if len(final_text) > 0:
            query_tree.append({ 
                "type": "keyword", "known": False, 
                "surface_form": final_text,
                "canonical_form": final_text})
    return query_tree


In [5]:
parsed_query = generate_query_tree(query, tagger_data)
parsed_query

[{'id': '7',
  'canonical_form': '{popular}',
  'type': 'semantic_function',
  'popularity': 100,
  'semantic_function': 'popularity(query, position)'},
 {'type': 'keyword',
  'known': False,
  'surface_form': 'kimchi',
  'canonical_form': 'kimchi'},
 {'id': '1',
  'canonical_form': '{location_distance}',
  'type': 'semantic_function',
  'popularity': 90,
  'semantic_function': 'location_distance(query, position)'},
 {'id': '4460243',
  'canonical_form': 'Charlotte',
  'admin_area': 'NC',
  'popularity': 827097,
  'type': 'city',
  'location_p': '35.22709,-80.84313'}]

### Listing 7.9

In [6]:
# %load -s popularity webserver/semantic_search/semantic_functions/__init__.py
def popularity(query, position):
    if (len(query["query_tree"]) -1 > position):
        query["query_tree"][position] = {
            "type": "engine",
            "query": '+{!func v="mul(if(stars_i,stars_i,0),20)"}'}
        return True
    else:
        return False

### Listing 7.10

In [7]:
# %load -s location_distance,create_geo_filter webserver/semantic_search/semantic_functions/__init__.py
def location_distance(query, position):
    if (len(query["query_tree"]) -1 > position):
        next_entity = query["query_tree"][position + 1]
        if (next_entity["type"] == "city"):
            query["query_tree"].pop(position + 1)
            query["query_tree"][position] = {
                "type": "engine",
                "query": create_geo_filter(next_entity['location_p'],
                "location_p", 50)}
            return True
    return False

def create_geo_filter(coordinates, field, distance_in_KM):
    return "+{!geofilt d=" + str(distance_in_KM) + ' sfield="' + field + '" pt="' + coordinates + '"}'

### Listing 7.11


In [8]:
# %load -s process_semantic_functions webserver/semantic_search/query_tree/__init__.py
def process_semantic_functions(query_tree):
    position = 0
    while position < len(query_tree):
        node = query_tree[position]
        if node["type"] == "semantic_function":
            commaned_is_resolved = False
            if node["semantic_function"]:
                query = {"query_tree": query_tree}
                commaned_is_resolved = eval(node["semantic_function"])
            if not commaned_is_resolved:
                query_tree.pop(position)
        position += 1
    return query_tree 

### Listing 7.12

In [9]:
# %load -s get_enrichments webserver/semantic_search/query_tree/__init__.py
def get_enrichments(collection, keyword):
    enrichments = {}
    nodes_to_traverse = [{"field": "text_t", "values": [keyword]},
                         [{"name": "related_terms", "field":
                           "text_t", "limit": 3},
                          {"name": "doc_type", "field": "doc_type",
                           "limit": 1}]]
    traversals = skg.traverse(collection, *nodes_to_traverse)
    nested_traversals = traversals["graph"][0]["values"][keyword]["traversals"]
    
    doc_types = list(filter(lambda t: t["name"] == "doc_type",
                            nested_traversals))
    if doc_types:
        enrichments["category"] = next(iter(doc_types[0]["values"]))
        
    term_vector = ""
    related_terms = list(filter(lambda t: t["name"] == "related_terms",
                                nested_traversals))
    if related_terms:
        for term, data in related_terms[0]["values"].items():
            term_vector += f'{term}^{round(data["relatedness"], 4)} '
    enrichments["term_vector"] = term_vector.strip()
    
    return enrichments

In [10]:
query = "kimchi"
reviews_collection = engine.get_collection("reviews")

get_enrichments(reviews_collection, query)

{'category': 'Korean',
 'term_vector': 'kimchi^0.9193 korean^0.7069 banchan^0.6593'}

In [11]:
other_queries = ["bbq", "korean bbq", "lasagna", "karaoke", "drive through"]
for query in other_queries:
    enrichments = get_enrichments(reviews_collection, query)
    print(f"{query}: {enrichments}")

bbq: {'category': 'Barbeque', 'term_vector': 'bbq^0.9191 ribs^0.6186 pork^0.5991'}
korean bbq: {'category': 'Korean', 'term_vector': 'bbq^0.9052 korean^0.8641 pork^0.6079'}
lasagna: {'category': 'Italian', 'term_vector': 'lasagna^0.9193 alfredo^0.3992 pasta^0.3909'}
karaoke: {'category': 'Karaoke', 'term_vector': 'karaoke^0.9193 sing^0.6423 songs^0.5256'}
drive through: {'category': 'Fast Food', 'term_vector': 'through^0.8999 drive^0.8613 thru^0.6118'}


### Listing 7.13

In [12]:
# %load -s enrich webserver/semantic_search/query_tree/__init__.py
def enrich(collection, query_tree):
    query_tree = process_semantic_functions(query_tree)    
    for i in range(len(query_tree)):
        item = query_tree[i]
        if item["type"] == "keyword":
            enrichments = get_enrichments(collection, item["surface_form"])
            query_tree[i] = {"type": "skg_enriched", 
                             "enrichments": enrichments}                    
    return query_tree

### Listing 7.14

In [13]:
# %load -s escape_quotes_in_query,transform_query webserver/semantic_search/query_tree/__init__.py
def escape_quotes_in_query(query):
    return query.replace('"', '\\"')

def transform_query(query_tree):
    for i in range(len(query_tree)):
        item = query_tree[i]
        additional_query = ""
        match item["type"]:
            case "engine":
                pass
            case "skg_enriched":
                enrichments = item["enrichments"]
                query_string = ""
                
                if "term_vector" in enrichments:
                    query_string = enrichments["term_vector"]
                if "category" in enrichments and len(query_string) > 0:
                    query_string += f' +doc_type:"{enrichments["category"]}"'
                if (len(query_string) == 0):
                    query_string = item["surface_form"]
                    
                additional_query = '{!edismax v="' + escape_quotes_in_query(query_string) + '"}'
            case "color":
                additional_query = f'+colors_s:"{item["canonical_form"]}"'
            case "known_item" | "event":
                additional_query = f'+name_s:"{item["canonical_form"]}"'
            case "city":
                additional_query = f'+city_t:"{str(item["name"])}"'
            case "brand":
                additional_query = f'+brand_s:"{item["canonical_form"]}"'
            case _:
                additional_query = "+{!edismax v=\"" + escape_quotes_in_query(item["surface_form"]) + "\"}"
        if additional_query:
            query_tree[i] = {"type": "transformed",
                             "syntax": "solr",
                             "query": additional_query}                 
    return query_tree

In [14]:
query = "good kimchi near charlotte"
tagger = TextTagger("entities")
reviews_collection = engine.get_collection("reviews")
tagger_data = tagger.tag_query(query)
query_tree = generate_query_tree(query, tagger_data)
enriched_query_tree = enrich(reviews_collection, query_tree)
print(enriched_query_tree)
processed_query_tree = transform_query(enriched_query_tree)
print(processed_query_tree)

[{'type': 'engine', 'query': '+{!func v="mul(if(stars_i,stars_i,0),20)"}'}, {'type': 'skg_enriched', 'enrichments': {'category': 'Korean', 'term_vector': 'kimchi^0.9193 korean^0.7069 banchan^0.6593'}}, {'type': 'engine', 'query': '+{!geofilt d=50 sfield="location_p" pt="35.22709,-80.84313"}'}]
[{'type': 'engine', 'query': '+{!func v="mul(if(stars_i,stars_i,0),20)"}'}, {'type': 'transformed', 'syntax': 'solr', 'query': '{!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\"Korean\\""}'}, {'type': 'engine', 'query': '+{!geofilt d=50 sfield="location_p" pt="35.22709,-80.84313"}'}]


### Listing 7.15

In [15]:
# %load -s to_query_string webserver/semantic_search/query_tree/__init__.py
def to_query_string(query_tree):
    return " ".join([node["query"] for node in query_tree])

In [16]:
query_string = to_query_string(query_tree)
query_string

'+{!func v="mul(if(stars_i,stars_i,0),20)"} {!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\"Korean\\""} +{!geofilt d=50 sfield="location_p" pt="35.22709,-80.84313"}'

In [17]:
reviews_collection = engine.get_collection("reviews")
request = {"query": query_string}
reviews_collection.search(request)

{'responseHeader': {'zkConnected': True,
  'status': 0,
  'QTime': 46,
  'params': {'json': '{"query": "+{!func v=\\"mul(if(stars_i,stars_i,0),20)\\"} {!edismax v=\\"kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\\\\\"Korean\\\\\\"\\"} +{!geofilt d=50 sfield=\\"location_p\\" pt=\\"35.22709,-80.84313\\"}"}'}},
 'response': {'numFound': 15842,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'id': 'WnLhd38sH80ViWwzyF7yoA',
    'name_t': 'Hibiscus',
    'city_t': 'Charlotte',
    'state_t': 'NC',
    'text_t': "We ate here for dinner and had a very tasty meal of bibimbap and bulgogi. Both dishes were done well and tasty. We had a very pleasant waitress who provided great service as well. Overall great! We'll definitely be back.",
    'stars_i': 5,
    'categories_t': 'Restaurants, Korean, Thai',
    'doc_type': ['Restaurants, Korean, Thai'],
    'location_pt_s': '35.171873,-80.849032',
    'location_p': '35.171873,-80.849032',
    'type_ss': ['Restaurants, Korean, Thai'],
    'l

In [18]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=good+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [None]:
#Cleanup so webserver doesn't keep running after you're done
stop_running_webservers()

## Success!

Up next: Chapter 8 - [Signals Boosting Models](../ch08/1.signals-boosting.ipynb)