# Semantic Search Application

In [1]:
import sys
import os
import json
sys.path.append('../..')
sys.path.append("webserver")
from aips import get_engine, get_semantic_knowledge_graph, get_entity_extractor
from semantic_search import *
from semantic_search.query_tree import *

# Engine handle plus the two collections this notebook works with.
engine = get_engine()
reviews_collection = engine.get_collection("reviews")
entities_collection = engine.get_collection("entities")
# Semantic knowledge graph built over the reviews collection.
skg = get_semantic_knowledge_graph(reviews_collection)

### Listing 7.2 & Figure 7.2
<a id='listing-7.2'></a>

In [2]:
def get_running_webservers():
    """Return the PIDs of running `start-webserver.py` processes.

    Relies on IPython's `!` shell capture, so it only works inside an
    IPython/Jupyter kernel. The captured output is returned as-is: one
    entry per output line, each a PID string.
    """
    # The `[s]` character class keeps grep from matching its own command
    # line; awk prints the second column of `ps -ef`, which is the PID.
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    """Send SIGKILL (`kill -9`) to every PID from get_running_webservers().

    Prints one "Stopping webserver" line per PID. Does nothing when no
    webserver process is listed.
    """
    already_running_webservers = get_running_webservers()
    for pid in already_running_webservers:
        print("Stopping webserver (pid: " + pid + ")")
        # `{pid}` is interpolated by IPython before the shell runs. The
        # command output is captured into `results`, which is not used.
        # NOTE(review): nothing is piped into xargs here, so a plain
        # `kill -9 {pid}` looks equivalent -- confirm before simplifying.
        results = ! xargs kill -9 {pid}

def start_reviews_search_webserver():
    """(Re)start the reviews search webserver in the background.

    Kills any instance that is already running, launches
    `webserver/start-webserver.py` as a background shell job, and prints
    the PID of the first matching process if one is listed.
    """
    stop_running_webservers() #in case it was already running
    # Replace IPython's `system` hook with os.system before launching.
    # NOTE(review): presumably this lets the backgrounded (`&`) command
    # return without blocking the cell -- confirm. The override is not
    # restored anywhere in this function.
    get_ipython().system = os.system
    ! cd webserver && python start-webserver.py &
    # The process list is checked immediately after launch; nothing is
    # printed if the server is not listed yet.
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

In [3]:
start_reviews_search_webserver()

Successfully Started Webserver (pid: 1104)!


In [4]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>

### Figure 7.3

In [5]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+near+charlotte" width=100% height="800"></iframe>

### Figure 7.4

In [6]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+charlotte" width=100% height="800"></iframe>

### Figure 7.5

In [7]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

### Figure 7.6

In [8]:
%%html
<iframe src="http://localhost:2345/search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.7

In [9]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [10]:
stop_running_webservers()

Stopping webserver (pid: 1104)


### Listing 7.3 : [located here](1.index-datasets.ipynb#listing_7_3).
### Listing 7.4 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.4).
### Listing 7.5 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.5).

### Listing 7.6

In [11]:
# Run the entity extractor over a sample query and show its raw output.
query = "top kimchi near charlotte"
entities_collection = engine.get_collection("entities")
extractor = get_entity_extractor(entities_collection)
# The result is a dict; the output below shows its "query", "tags" and
# "entities" keys, which the later listings consume.
query_entities = extractor.extract_entities(query)
display(query_entities)

{'query': 'top kimchi near charlotte',
 'tags': [{'startOffset': 0, 'endOffset': 3, 'matchText': 'top', 'ids': ['7']},
  {'startOffset': 11, 'endOffset': 15, 'matchText': 'near', 'ids': ['1', '5']},
  {'startOffset': 16,
   'endOffset': 25,
   'matchText': 'charlotte',
   'ids': ['4460243', '4612828', '4680560', '4988584', '5234793']}],
 'entities': [{'semantic_function': 'location_distance(query, position)',
   'popularity': 90,
   'id': '1',
   'surface_form': 'near',
   'type': 'semantic_function',
   'canonical_form': '{location_distance}'},
  {'semantic_function': 'text_distance(query, position)',
   'popularity': 10,
   'id': '5',
   'surface_form': 'near',
   'type': 'semantic_function',
   'canonical_form': '{text_distance}'},
  {'semantic_function': 'popularity(query, position)',
   'popularity': 100,
   'id': '7',
   'surface_form': 'top',
   'type': 'semantic_function',
   'canonical_form': '{popular}'},
  {'country': 'US',
   'admin_area': 'NC',
   'popularity': 827097,
   

### Listing 7.7


In [12]:
def generate_tagged_query(extracted_entities):
    """Render the query with every tagged span wrapped in curly braces.

    Walks ``extracted_entities["tags"]`` in order. Untagged text between
    tags is copied through (whitespace-trimmed) and each tag contributes
    ``{matchText}``. Every piece is prefixed with one space, so a
    non-empty result always starts with a space.

    Args:
        extracted_entities: dict with a "query" string and a "tags" list
            whose items carry "startOffset", "endOffset" and "matchText".

    Returns:
        The tagged query string ("" when there is nothing to emit).
    """
    query = extracted_entities["query"]
    pieces = []
    cursor = 0
    for tag in extracted_entities["tags"]:
        gap = query[cursor:tag["startOffset"]].strip()
        if gap:
            pieces.append(gap)
        pieces.append("{" + tag["matchText"] + "}")
        cursor = tag["endOffset"]
    # Text after the last tag; the slice is empty when the last tag
    # already reaches the end of the query.
    tail = query[cursor:].strip()
    if tail:
        pieces.append(tail)
    return "".join(" " + piece for piece in pieces)

tagged_query = generate_tagged_query(query_entities)
print(tagged_query)

 {top} kimchi {near} {charlotte}


### Listing 7.8

In [13]:
# %load -s generate_query_tree semantic_search/__init__.py
def generate_query_tree(extracted_entities):
    """Build an ordered query tree from entity-extractor output.

    Each tag is resolved to the candidate entity with the highest
    "popularity" (the first candidate wins ties). Untagged text before,
    between and after tags becomes "keyword" nodes. Entity nodes are the
    same dict objects found in ``extracted_entities["entities"]``.

    Args:
        extracted_entities: dict with "query" (str), "tags" (items with
            "startOffset", "endOffset", "ids") and "entities" (items
            with "id" and "popularity").

    Returns:
        A list of node dicts in query order.
    """
    query = extracted_entities["query"]
    entities_by_id = {}
    for entity in extracted_entities["entities"]:
        entities_by_id[entity["id"]] = entity

    def keyword_node(text):
        # Plain text is carried through unchanged in both form fields.
        return {"type": "keyword",
                "surface_form": text,
                "canonical_form": text}

    nodes = []
    cursor = 0
    for tag in extracted_entities["tags"]:
        candidates = [entities_by_id[entity_id] for entity_id in tag["ids"]]
        winner = candidates[0]
        for candidate in candidates:
            # Strict ">" keeps the earliest candidate on equal popularity.
            if candidate["popularity"] > winner["popularity"]:
                winner = candidate

        preceding = query[cursor:tag["startOffset"]].strip()
        if preceding:
            nodes.append(keyword_node(preceding))
        nodes.append(winner)
        cursor = tag["endOffset"]

    # Anything left after the final tag is one trailing keyword node.
    trailing = query[cursor:].strip()
    if trailing:
        nodes.append(keyword_node(trailing))
    return nodes

In [14]:
parsed_query = generate_query_tree(query_entities)
display(parsed_query)

[{'semantic_function': 'popularity(query, position)',
  'popularity': 100,
  'id': '7',
  'surface_form': 'top',
  'type': 'semantic_function',
  'canonical_form': '{popular}'},
 {'type': 'keyword', 'surface_form': 'kimchi', 'canonical_form': 'kimchi'},
 {'semantic_function': 'location_distance(query, position)',
  'popularity': 90,
  'id': '1',
  'surface_form': 'near',
  'type': 'semantic_function',
  'canonical_form': '{location_distance}'},
 {'country': 'US',
  'admin_area': 'NC',
  'popularity': 827097,
  'id': '4460243',
  'surface_form': 'Charlotte',
  'type': 'city',
  'location_coordinates': '35.22709,-80.84313',
  'canonical_form': 'Charlotte'}]

### Listing 7.9

In [15]:
# %load -s popularity semantic_search/semantic_functions/__init__.py
def popularity(query, position):
    """Replace the node at ``position`` with a stars-rating boost clause.

    The replacement only happens when at least one node follows
    ``position`` in ``query["query_tree"]``; the tree is modified in
    place.

    Returns:
        True if the node was replaced, False otherwise.
    """
    tree = query["query_tree"]
    has_following_node = position < len(tree) - 1
    if not has_following_node:
        return False
    # Function query: stars_rating (0 when the field has no value) times 20.
    boost_clause = '+{!func v="mul(if(stars_rating,stars_rating,0),20)"}'
    tree[position] = {"type": "transformed",
                      "syntax": "solr",
                      "query": boost_clause}
    return True

### Listing 7.10

In [16]:
# %load -s location_distance,create_geo_filter semantic_search/semantic_functions/__init__.py
def location_distance(query, position):
    """Turn "<this node> <city>" into a single geo-filter node.

    When the node right after ``position`` has type "city", that city
    node is removed and the node at ``position`` is replaced by a
    transformed node holding a geo filter around the city's
    ``location_coordinates`` (distance argument 50). The tree in
    ``query["query_tree"]`` is modified in place.

    Returns:
        True if the replacement happened, False otherwise.
    """
    tree = query["query_tree"]
    if position >= len(tree) - 1:
        return False
    following = tree[position + 1]
    if following["type"] != "city":
        return False
    # Drop the city node first; its data lives on in `following`.
    tree.pop(position + 1)
    geo_clause = create_geo_filter(following['location_coordinates'],
                                   "location_coordinates", 50)
    tree[position] = {"type": "transformed",
                      "syntax": "solr",
                      "query": geo_clause}
    return True

def create_geo_filter(coordinates, field, distance_KM):
    """Build a required ``{!geofilt ...}`` clause.

    Args:
        coordinates: value placed in the ``pt`` parameter.
        field: field name placed in the ``sfield`` parameter.
        distance_KM: value placed in the ``d`` parameter.

    Returns:
        The clause string, prefixed with ``+``.
    """
    params = " ".join([f"d={distance_KM}",
                       f'sfield="{field}"',
                       f'pt="{coordinates}"'])
    return "+{!geofilt " + params + "}"

### Listing 7.11


In [17]:
# %load -s process_semantic_functions semantic_search/query_tree.py
def process_semantic_functions(query_tree):
    """Run every "semantic_function" node's command against the tree.

    Walks ``query_tree`` by index. For each node of type
    "semantic_function", the node's "semantic_function" string is
    evaluated as a Python expression. A falsy result re-labels the node
    as "invalid_semantic_function". The list is modified in place and
    also returned.
    """
    position = 0
    # The length is re-read on every pass because a command may remove
    # nodes from the list (location_distance pops the city node).
    while position < len(query_tree):
        node = query_tree[position]
        if node["type"] == "semantic_function":
            # The evaluated expression refers to the local names `query`
            # and `position` (e.g. 'popularity(query, position)'), so
            # these two variables must keep exactly these names.
            query = {"query_tree": query_tree} 
            # NOTE(security): eval() runs whatever expression the node
            # carries. Presumably these strings come from the entities
            # collection -- verify that source is trusted.
            command_successful = eval(node["semantic_function"])
            if not command_successful:
                node["type"] = "invalid_semantic_function"
        position += 1
    return query_tree 

### Listing 7.12

In [18]:
# %load -s get_enrichments semantic_search/query_tree.py
def get_enrichments(collection, keyword, limit=4):
    """Look up a category and weighted related terms for ``keyword``.

    Traverses the semantic knowledge graph of ``collection`` from
    ``keyword`` (matched on the "content" field) to up to ``limit``
    related "content" terms and to one "doc_type" value.

    Args:
        collection: collection handed to get_semantic_knowledge_graph.
        keyword: the term to enrich.
        limit: maximum number of related terms requested.

    Returns:
        A dict that may contain "category" (first doc_type value) and
        "term_vector" (space-separated ``term^relatedness`` pairs, with
        relatedness rounded to 4 places). Empty when the traversal
        result for ``keyword`` has no "traversals" entry.
    """
    start_node = {"field": "content",
                  "values": [keyword],
                  "default_operator": "OR"}
    related_nodes = [{"name": "related_terms",
                      "field": "content",
                      "limit": limit},
                     {"name": "doc_type",
                      "field": "doc_type",
                      "limit": 1}]
    graph = get_semantic_knowledge_graph(collection)
    response = graph.traverse(start_node, related_nodes)

    keyword_result = response["graph"][0]["values"][keyword]
    if "traversals" not in keyword_result:
        return {}
    nested = keyword_result["traversals"]

    enrichments = {}
    category_nodes = [node for node in nested if node["name"] == "doc_type"]
    if category_nodes:
        # Only the first doc_type value is used as the category.
        enrichments["category"] = next(iter(category_nodes[0]["values"]))

    term_nodes = [node for node in nested if node["name"] == "related_terms"]
    if term_nodes:
        weighted_terms = [f'{term}^{round(stats["relatedness"], 4)}'
                          for term, stats in term_nodes[0]["values"].items()]
        enrichments["term_vector"] = " ".join(weighted_terms).strip()

    return enrichments

In [19]:
query = "kimchi"
get_enrichments(reviews_collection, query)

{'category': 'Korean',
 'term_vector': 'kimchi^0.9193 korean^0.7069 banchan^0.6593 bulgogi^0.5497'}

In [20]:
other_queries = ["bbq", '"korean bbq"', "lasagna", "karaoke", '"drive through"']
for query in other_queries:
    enrichments = get_enrichments(reviews_collection, query)
    print(f"{query}: {enrichments}")

bbq: {'category': 'Barbeque', 'term_vector': 'bbq^0.9191 ribs^0.6187 pork^0.5992 brisket^0.5691'}
"korean bbq": {'category': 'Korean', 'term_vector': 'korean^0.7754 bbq^0.6716 banchan^0.5534 sariwon^0.5211'}
lasagna: {'category': 'Italian', 'term_vector': 'lasagna^0.9193 alfredo^0.3992 pasta^0.3909 italian^0.3742'}
karaoke: {'category': 'Karaoke', 'term_vector': 'karaoke^0.9193 sing^0.6423 songs^0.5256 song^0.4118'}
"drive through": {'category': 'Fast Food', 'term_vector': "drive^0.7428 through^0.6331 mcdonald's^0.2873 window^0.2643"}


### Listing 7.13

In [21]:
# %load -s enrich semantic_search/query_tree.py
def enrich(collection, query_tree):
    """Apply semantic functions, then enrich the remaining keyword nodes.

    After process_semantic_functions has run, every node still typed
    "keyword" is looked up with get_enrichments. When that returns a
    non-empty result the node is re-typed "skg_enriched" and the result
    is stored under "enrichments". Nodes are updated in place.

    Returns:
        The list returned by process_semantic_functions.
    """
    processed = process_semantic_functions(query_tree)
    for node in processed:
        if node["type"] != "keyword":
            continue
        found = get_enrichments(collection, node["surface_form"])
        if not found:
            # Nothing known about this keyword; leave it untouched.
            continue
        node["type"] = "skg_enriched"
        node["enrichments"] = found
    return processed

### Listing 7.14

In [22]:
%%script false
#Currently stubbed out for M.2 Issue with onnxruntime-gpu
from spladerunner import Expander
expander = Expander('Splade_PP_en_v1', 128)
queries = ["kimchi", "bbq", "korean bbq", 
           "lasagna", "karaoke", "drive through"]

for query in queries:
  sparse_vec = expander.expand(query,
                               outformat="lucene")[0]
  print(sparse_vec)     

{'kim': 3.11, '##chi': 3.04, 'ki': 1.52, ',': 0.92, 'who': 0.72, 'brand': 0.56, 'genre': 0.46, 'chi': 0.45, '##chy': 0.45, 'company': 0.41, 'only': 0.39, 'take': 0.31, 'club': 0.25, 'species': 0.22, 'color': 0.16, 'type': 0.15, 'but': 0.13, 'dish': 0.12, 'hotel': 0.11, 'music': 0.09, 'style': 0.08, 'name': 0.06, 'religion': 0.01}
{'bb': 2.78, 'grill': 1.85, 'barbecue': 1.36, 'dinner': 0.91, '##q': 0.78, 'dish': 0.77, 'restaurant': 0.65, 'sport': 0.46, 'food': 0.34, 'style': 0.34, 'eat': 0.24, 'a': 0.23, 'genre': 0.12, 'definition': 0.09}
{'korean': 2.84, 'korea': 2.56, 'bb': 2.23, 'grill': 1.58, 'dish': 1.21, 'restaurant': 1.18, 'barbecue': 0.79, 'kim': 0.67, 'food': 0.64, 'dinner': 0.39, 'restaurants': 0.32, 'japanese': 0.31, 'eat': 0.27, 'hotel': 0.16, 'famous': 0.11, 'brand': 0.11, '##q': 0.06, 'diner': 0.02}
{'las': 2.87, '##ag': 2.85, '##na': 2.39, ',': 0.84, 'she': 0.5, 'species': 0.34, 'hotel': 0.33, 'club': 0.31, 'location': 0.3, 'festival': 0.29, 'company': 0.27, 'her': 0.2, '

In [None]:
"kimchi"
{'kim': 3.11, '##chi': 3.04, 'ki': 1.52, ',': 0.92, 'who': 0.72,
 'brand': 0.56, 'genre': 0.46, 'chi': 0.45, '##chy': 0.45,
 'company': 0.41, 'only': 0.39, 'take': 0.31, 'club': 0.25,
 'species': 0.22, 'color': 0.16, 'type': 0.15, 'but': 0.13,
 'dish': 0.12, 'hotel': 0.11, 'music': 0.09, 'style': 0.08,
 'name': 0.06, 'religion': 0.01}

"bbq"
{'bb': 2.78, 'grill': 1.85, 'barbecue': 1.36, 'dinner': 0.91,
 '##q': 0.78, 'dish': 0.77, 'restaurant': 0.65, 'sport': 0.46,
 'food': 0.34, 'style': 0.34, 'eat': 0.24, 'a': 0.23, 'genre': 0.12,
 'definition': 0.09}

"korean bbq"
{'korean': 2.84, 'korea': 2.56, 'bb': 2.23, 'grill': 1.58, 'dish': 1.21,
 'restaurant': 1.18, 'barbecue': 0.79, 'kim': 0.67, 'food': 0.64,
 'dinner': 0.39, 'restaurants': 0.32, 'japanese': 0.31, 'eat': 0.27,
 'hotel': 0.16, 'famous': 0.11, 'brand': 0.11, '##q': 0.06, 'diner': 0.02}

"lasagna"
{'las': 2.87, '##ag': 2.85, '##na': 2.39, ',': 0.84, 'she': 0.5,
 'species': 0.34, 'hotel': 0.33, 'club': 0.31, 'location': 0.3,
 'festival': 0.29, 'company': 0.27, 'her': 0.2, 'city': 0.12,
 'genre': 0.05}

"karaoke"
{'kara': 3.04, '##oke': 2.87, 'music': 1.31, 'lara': 1.07,
 'song': 1.03, 'dance': 0.97, 'style': 0.94, 'sara': 0.81,
 'genre': 0.75, 'dress': 0.48, 'dish': 0.44, 'singer': 0.37,
 'hannah': 0.36, 'brand': 0.31, 'who': 0.29, 'culture': 0.21,
 'she': 0.17, 'mix': 0.17, 'popular': 0.12, 'girl': 0.12,
 'kelly': 0.08, 'wedding': 0.0}

"drive through"
{'through': 2.94, 'drive': 2.87, 'driving': 2.34, 'past': 1.75,
 'drives': 1.65, 'thru': 1.44, 'driven': 1.22, 'enter': 0.81,
 'drove': 0.81, 'pierce': 0.75, 'in': 0.72, 'by': 0.71, 'into': 0.64,
 'travel': 0.59, 'mark': 0.51, ';': 0.44, 'clear': 0.41,
 'transport': 0.41, 'route': 0.39, 'within': 0.36, 'vehicle': 0.3,
 'via': 0.15}

### Listing 7.15

In [23]:
# %load -s escape_quotes_in_query,transform_query semantic_search/query_tree.py
def escape_quotes(text):
    """Return ``text`` with every double quote preceded by a backslash."""
    return text.translate({ord('"'): '\\"'})

def transform_query(query_tree):
    """Rewrite every remaining node into a "transformed" solr clause.

    Handling per node type:
      * "transformed": left as is.
      * "skg_enriched": an edismax clause over the enrichment term
        vector, plus a required doc_type match when a category is
        present. Left as is when there is no term vector.
      * "color", "known_item", "event", "city", "brand": a required
        exact match of the canonical form on the matching field.
      * anything else: an edismax clause over the surface form.

    The list is modified in place and also returned.
    """
    exact_match_fields = (("color", "colors_s"),
                          ("known_item", "name_s"),
                          ("event", "name_s"),
                          ("brand", "brand_s"))
    for index, node in enumerate(query_tree):
        node_type = node["type"]
        if node_type == "transformed":
            continue

        field = next((name for kind, name in exact_match_fields
                      if node_type == kind), None)
        if node_type == "skg_enriched":
            enrichments = node["enrichments"]
            if "term_vector" not in enrichments:
                continue
            boosted_terms = enrichments["term_vector"]
            if "category" in enrichments:
                boosted_terms += f' +doc_type:"{enrichments["category"]}"'
            solr_clause = '+{!edismax v="' + escape_quotes(boosted_terms) + '"}'
        elif node_type == "city":
            solr_clause = '+city:"' + str(node["canonical_form"]) + '"'
        elif field is not None:
            solr_clause = f'+{field}:"{node["canonical_form"]}"'
        else:
            # Fallback: search the text exactly as the user typed it.
            solr_clause = "+{!edismax v=\"" + escape_quotes(node["surface_form"]) + "\"}"

        query_tree[index] = {"type": "transformed",
                             "syntax": "solr",
                             "query": solr_clause}
    return query_tree

In [24]:
# End-to-end pipeline: extract entities, build the query tree, enrich
# it, then transform every node into engine syntax.
query = "top kimchi near Charlotte"
tagger_data = extractor.extract_entities(query)
query_tree = generate_query_tree(tagger_data)
# enrich() and transform_query() as defined above modify the list in
# place and return it, so query_tree, enriched_query_tree and
# processed_query_tree all refer to the same list afterwards.
enriched_query_tree = enrich(reviews_collection, query_tree)
processed_query_tree = transform_query(enriched_query_tree)
display(processed_query_tree)

[{'type': 'transformed',
  'syntax': 'solr',
  'query': '+{!func v="mul(if(stars_rating,stars_rating,0),20)"}'},
 {'type': 'transformed',
  'syntax': 'solr',
  'query': '+{!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 bulgogi^0.5497 +doc_type:\\"Korean\\""}'},
 {'type': 'transformed',
  'syntax': 'solr',
  'query': '+{!geofilt d=50 sfield="location_coordinates" pt="35.22709,-80.84313"}'}]

### Listing 7.16

In [25]:
# %load -s to_query_string semantic_search/query_tree.py
def to_query_string(query_tree):
    """Concatenate the "query" clause of every node, space-separated.

    Every node must carry a "query" key, so the tree should already be
    fully transformed.
    """
    clauses = (node["query"] for node in query_tree)
    return " ".join(clauses)

In [26]:
# Build the final query from the fully transformed tree. Passing
# processed_query_tree (rather than query_tree) makes the dependency on
# the transform step explicit instead of relying on in-place mutation.
query_string = to_query_string(processed_query_tree)

In [33]:
reviews_collection.search(query=query_string,
                          return_fields=["business_name", "score"])

{'docs': [{'business_name': 'Hibiscus', 'score': 103.72602},
  {'business_name': 'Cho Won Garden', 'score': 103.72602},
  {'business_name': 'Korean Restaurant', 'score': 103.72602},
  {'business_name': 'Simplee Sushi', 'score': 103.45139},
  {'business_name': 'CO', 'score': 103.22704},
  {'business_name': 'Kojan Bistro', 'score': 83.72602},
  {'business_name': 'Bulgogi Box', 'score': 83.72602},
  {'business_name': 'China Wing', 'score': 83.72602},
  {'business_name': 'Fujiyama', 'score': 83.45139},
  {'business_name': 'PePeRo', 'score': 83.45139}]}

In [28]:
start_reviews_search_webserver()

Successfully Started Webserver (pid: 1188)!


In [29]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=good+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [30]:
#Cleanup so webserver doesn't keep running after you're done
stop_running_webservers()

Stopping webserver (pid: 1188)


## Success!
Up next: Chapter 8 - [Signals Boosting Models](../ch08/1.signals-boosting.ipynb)