# Semantic Search Application

In [1]:
import json
import os
import sys

sys.path.append('..')
sys.path.append("webserver")
from aips import get_engine, get_semantic_knowledge_graph, get_entity_extractor
from semantic_search import *
from semantic_search.query_tree import *

engine = get_engine()
skg = get_semantic_knowledge_graph()

## Starting the Reviews Search Web Server and Launching the Search Page

In [2]:
def get_running_webservers():
    """Return the PIDs of running `start-webserver.py` processes.

    Relies on IPython's `!` shell capture, so it only works inside an
    IPython/Jupyter session. The captured lines are used as strings by the
    callers in this notebook (concatenated into messages and indexed).
    """
    # The `[s]` bracket in the grep pattern stops the grep process itself from
    # matching; awk then prints the second column of each `ps -ef` line, which
    # the callers treat as the process id.
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    """Kill every running `start-webserver.py` process, printing each PID first."""
    already_running_webservers = get_running_webservers()
    for pid in already_running_webservers:
        print("Stopping webserver (pid: " + pid + ")")
        # `{pid}` is interpolated by IPython before the shell runs the command.
        # The captured output in `results` is not used afterwards.
        # NOTE(review): `xargs` is given no piped input here, so this presumably
        # just runs `kill -9 <pid>` once -- confirm whether plain `kill` was intended.
        results = ! xargs kill -9 {pid}

def start_reviews_search_webserver():
    """Restart the reviews search web server as a background process.

    Stops any running instance, launches `start-webserver.py` from the
    `../webserver` directory, and prints the PID of the first matching process
    if one is found. Nothing is printed when no process is found.
    """
    stop_running_webservers() #in case it was already running
    # Replaces the IPython shell's `system` hook with `os.system`. The
    # replacement is made on the shell object itself, so it is not undone when
    # this function returns.
    # NOTE(review): presumably done so the backgrounded command below does not
    # block the cell -- confirm.
    get_ipython().system = os.system
    # The trailing `&` runs the server in the background.
    ! cd ../webserver && python start-webserver.py &
    # NOTE(review): this check runs immediately after launch; whether the new
    # process is always visible to `ps` by then is not established here.
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

### Listing 7.2

In [3]:
#Start the web server
start_reviews_search_webserver()

Stopping webserver (pid: 9713)
Successfully Started Webserver (pid: 10007)!


### Figure 7.2

In [4]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>

### Figure 7.3

In [5]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+near+charlotte" width=100% height="800"></iframe>

### Figure 7.4

In [6]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+charlotte" width=100% height="800"></iframe>

### Figure 7.5

In [7]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.6

In [8]:
%%html
<iframe src="http://localhost:2345/search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.7

In [9]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

### Listing 7.3

In [10]:
!! cat ../data/reviews/entities.csv

['id,surface_form,canonical_form,type,popularity,semantic_function',
 '1,near,{location_distance},semantic_function,90,"location_distance(query, position)"',
 '2,in,{location_distance},semantic_function,100,"location_distance(query, position)"',
 '3,by,{location_distance},semantic_function,90,"location_distance(query, position)"',
 '4,by,{text_within_one_edit_distance},semantic_function,10,"text_within_one_edit_distance(query, position)"',
 '5,near,{text_distance},semantic_function,10,"text_distance(query, position)"',
 '6,popular,{popular},semantic_function,100,"popularity(query, position)"',
 '7,top,{popular},semantic_function,100,"popularity(query, position)"',
 '8,best,{popular},semantic_function,100,"popularity(query, position)"',
 '9,good,{popular},semantic_function,100,"popularity(query, position)"',
 '10,violet,violet,color,100,',
 '11,violet crowne,violet crowne,brand,100,',
 '12,violet crowne charlottesville,violet crowne charlottesville,movie_theater,100,',
 '13,violet crown

### Listing 7.4 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.4).

### Listing 7.5 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.5).

### Listing 7.6

In [11]:
# Run entity extraction on a sample query. The result (shown below) is a dict
# with the original "query", the matched "tags" (character offsets, matched
# text and candidate entity ids) and the "entities" those ids refer to.
query = "top kimchi near charlotte"
extractor = get_entity_extractor("entities")
query_entities = extractor.extract_entities(query)
query_entities

{'query': 'top kimchi near charlotte',
 'tags': [{'startOffset': 0, 'endOffset': 3, 'matchText': 'top', 'ids': ['7']},
  {'startOffset': 11, 'endOffset': 15, 'matchText': 'near', 'ids': ['1', '5']},
  {'startOffset': 16,
   'endOffset': 25,
   'matchText': 'charlotte',
   'ids': ['4460243', '4612828', '4680560', '4988584', '5234793']}],
 'entities': [{'semantic_function': 'location_distance(query, position)',
   'popularity': 90,
   'id': '1',
   'surface_form': 'near',
   'type': 'semantic_function',
   'canonical_form': '{location_distance}'},
  {'semantic_function': 'text_distance(query, position)',
   'popularity': 10,
   'id': '5',
   'surface_form': 'near',
   'type': 'semantic_function',
   'canonical_form': '{text_distance}'},
  {'semantic_function': 'popularity(query, position)',
   'popularity': 100,
   'id': '7',
   'surface_form': 'top',
   'type': 'semantic_function',
   'canonical_form': '{popular}'},
  {'country': 'US',
   'admin_area': 'NC',
   'popularity': 827097,
   

### Listing 7.7


In [12]:
def generate_tagged_query(extracted_entities):
    """Render the query with every tagged span wrapped in curly braces.

    Args:
        extracted_entities: dict with the original "query" string and a list
            of "tags", each carrying "startOffset", "endOffset" and
            "matchText".

    Returns:
        The query text where each tag appears as ``{matchText}`` and untagged
        text is kept as-is. Every piece is preceded by a single space, so a
        non-empty result starts with a space.
    """
    text = extracted_entities["query"]
    pieces = []
    cursor = 0
    for tag in extracted_entities["tags"]:
        # Untagged text between the previous tag and this one.
        gap = text[cursor:tag["startOffset"]].strip()
        if gap:
            pieces.append(gap)
        pieces.append("{" + tag["matchText"] + "}")
        cursor = tag["endOffset"]

    # Whatever follows the last tag.
    trailing = text[cursor:].strip()
    if trailing:
        pieces.append(trailing)

    return "".join(" " + piece for piece in pieces)

tagged_query = generate_tagged_query(query_entities)
print(tagged_query)

 {top} kimchi {near} {charlotte}


### Listing 7.8

In [13]:
# %load -s generate_query_tree semantic_search/__init__.py
def generate_query_tree(extracted_entities):
    """Turn entity-extraction output into a flat list of query-tree nodes.

    For every tag, the candidate entity with the highest "popularity" is chosen
    (the first one wins on a tie) and appended to the tree. Text between tags,
    and after the last tag, becomes an unknown "keyword" node.

    Args:
        extracted_entities: dict with
            "query": the original query string,
            "tags": list of dicts with "startOffset", "endOffset" and "ids",
            "entities": list of entity dicts, each with an "id".

    Returns:
        A list of nodes in query order: entity dicts (the same objects as in
        ``extracted_entities["entities"]``) and keyword dicts of the form
        ``{"type": "keyword", "known": False, "surface_form": ..., "canonical_form": ...}``.
    """
    def keyword_node(text):
        # Plain, unrecognised query text.
        return {"type": "keyword", "known": False,
                "surface_form": text,
                "canonical_form": text}

    query = extracted_entities["query"]
    doc_map = {doc["id"]: doc for doc in extracted_entities["entities"]}

    query_tree = []
    last_end = 0
    for tag in extracted_entities["tags"]:
        # Only ids that actually resolve to an entity can be candidates.
        candidates = [doc_map[doc_id] for doc_id in tag["ids"] if doc_id in doc_map]
        if not candidates:
            # Nothing to resolve this tag to: leave `last_end` untouched so the
            # tagged text is kept as part of the surrounding keyword text
            # instead of raising a KeyError.
            continue
        # max() returns the first maximal element, matching the original
        # "strictly greater replaces" tie-breaking.
        best_doc = max(candidates, key=lambda doc: doc.get("popularity", 0))

        next_text = query[last_end:tag["startOffset"]].strip()
        if next_text:
            query_tree.append(keyword_node(next_text))
        query_tree.append(best_doc)
        last_end = tag["endOffset"]

    final_text = query[last_end:].strip()
    if final_text:
        query_tree.append(keyword_node(final_text))
    return query_tree

In [14]:
parsed_query = generate_query_tree(query_entities)
parsed_query

[{'semantic_function': 'popularity(query, position)',
  'popularity': 100,
  'id': '7',
  'surface_form': 'top',
  'type': 'semantic_function',
  'canonical_form': '{popular}'},
 {'type': 'keyword',
  'known': False,
  'surface_form': 'kimchi',
  'canonical_form': 'kimchi'},
 {'semantic_function': 'location_distance(query, position)',
  'popularity': 90,
  'id': '1',
  'surface_form': 'near',
  'type': 'semantic_function',
  'canonical_form': '{location_distance}'},
 {'country': 'US',
  'admin_area': 'NC',
  'popularity': 827097,
  'id': '4460243',
  'surface_form': 'Charlotte',
  'type': 'city',
  'location_coordinates': '35.22709,-80.84313',
  'canonical_form': 'Charlotte'}]

### Listing 7.9

In [15]:
# %load -s popularity semantic_search/semantic_functions/__init__.py
def popularity(query, position):
    """Semantic function for popularity terms such as "top" or "best".

    When at least one node follows ``position`` in ``query["query_tree"]``, the
    node at ``position`` is replaced in place with a Solr function query built
    from ``stars_rating``.

    Returns:
        True if the node was replaced, False if nothing follows ``position``.
    """
    tree = query["query_tree"]
    has_following_node = position < len(tree) - 1
    if not has_following_node:
        return False

    tree[position] = {
        "type": "transformed",
        "syntax": "solr",
        "query": '+{!func v="mul(if(stars_rating,stars_rating,0),20)"}',
    }
    return True

### Listing 7.10

In [16]:
# %load -s location_distance,create_geo_filter semantic_search/semantic_functions/__init__.py
def location_distance(query, position):
    """Semantic function for location terms such as "near" or "in".

    If the node right after ``position`` in ``query["query_tree"]`` has type
    "city", that node is removed and the node at ``position`` is replaced in
    place with a Solr geo filter around the city's ``location_coordinates``
    (distance argument 50).

    Returns:
        True if the filter was created, False when there is no following node
        or it is not a city.
    """
    tree = query["query_tree"]
    if position >= len(tree) - 1:
        return False

    following = tree[position + 1]
    if following["type"] != "city":
        return False

    # The city node is consumed by the filter, so drop it from the tree first.
    del tree[position + 1]
    geo_filter = create_geo_filter(following['location_coordinates'],
                                   "location_coordinates", 50)
    tree[position] = {"type": "transformed",
                      "syntax": "solr",
                      "query": geo_filter}
    return True

def create_geo_filter(coordinates, field, distance_in_KM):
    """Build a required (``+``) Solr ``geofilt`` local-params clause.

    Args:
        coordinates: value placed in the ``pt`` parameter.
        field: value placed in the ``sfield`` parameter.
        distance_in_KM: value placed in the ``d`` parameter.
    """
    local_params = f'!geofilt d={distance_in_KM} sfield="{field}" pt="{coordinates}"'
    return f"+{{{local_params}}}"

### Listing 7.11


In [17]:
# %load -s process_semantic_functions semantic_search/query_tree/__init__.py
def process_semantic_functions(query_tree):
    """Execute every "semantic_function" node of the query tree in place.

    Each semantic-function node carries a Python expression in its
    "semantic_function" field (for example ``"popularity(query, position)"``).
    The expression is evaluated with ``query`` (a dict wrapping the tree) and
    ``position`` (the node's index) in scope and is expected to rewrite the
    tree and return a truthy value on success. Nodes whose expression is empty
    or returns a falsy value are removed from the tree.

    Args:
        query_tree: list of node dicts; it is modified in place.

    Returns:
        The same ``query_tree`` list.
    """
    position = 0
    while position < len(query_tree):
        node = query_tree[position]
        if node["type"] != "semantic_function":
            position += 1
            continue

        command_is_resolved = False
        if node["semantic_function"]:
            # `query` and `position` must keep these exact names: the evaluated
            # expressions refer to them.
            query = {"query_tree": query_tree}
            # SECURITY NOTE: eval() executes whatever expression is stored on
            # the entity. Only use this with entity data you fully control.
            command_is_resolved = eval(node["semantic_function"])

        if command_is_resolved:
            position += 1
        else:
            # Removing the node shifts the next one into `position`, so the
            # index must NOT advance here; otherwise that next node would be
            # skipped and a following semantic function never executed.
            query_tree.pop(position)
    return query_tree

### Listing 7.12

In [18]:
# %load -s get_enrichments semantic_search/query_tree/__init__.py
def get_enrichments(collection, keyword):
    """Look up related terms and a category for a keyword via the SKG.

    Runs a traversal through the module-level ``skg`` object that starts at
    ``keyword`` in the "content" field and requests up to 3 "related_terms"
    (from "content") and 1 "doc_type" value.

    Args:
        collection: passed through to ``skg.traverse``.
        keyword: the text to enrich.

    Returns:
        A dict that always has "term_vector" (space-separated
        ``term^relatedness`` pairs, relatedness rounded to 4 places, possibly
        an empty string) and has "category" only when the "doc_type" traversal
        returned at least one value.
    """
    nodes_to_traverse = [{"field": "content",
                          "values": [keyword]},
                         [{"name": "related_terms",
                           "field": "content",
                           "limit": 3},
                          {"name": "doc_type",
                           "field": "doc_type",
                           "limit": 1}]]
    traversals = skg.traverse(collection, *nodes_to_traverse)
    nested_traversals = traversals["graph"][0]["values"][keyword]["traversals"]

    def first_named(name):
        # First traversal with the given name, or None if there is none.
        return next((t for t in nested_traversals if t["name"] == name), None)

    enrichments = {}
    doc_type = first_named("doc_type")
    if doc_type is not None:
        # The traversal may come back with no values at all; a bare
        # next(iter(...)) would raise StopIteration in that case.
        category = next(iter(doc_type["values"]), None)
        if category is not None:
            enrichments["category"] = category

    term_boosts = []
    related_terms = first_named("related_terms")
    if related_terms is not None:
        term_boosts = [f'{term}^{round(data["relatedness"], 4)}'
                       for term, data in related_terms["values"].items()]
    enrichments["term_vector"] = " ".join(term_boosts).strip()

    return enrichments

In [19]:
reviews_collection = engine.get_collection("reviews")
query = "kimchi"
get_enrichments(reviews_collection, query)

{'category': 'Korean',
 'term_vector': 'kimchi^0.9193 korean^0.7069 banchan^0.6593'}

In [20]:
# Show the enrichments for a few more keywords.
# Note: the loop variable rebinds the notebook-level `query` name; after this
# cell it holds the last entry of `other_queries`.
other_queries = ["bbq", "korean bbq", "lasagna", "karaoke", "drive through"]
for query in other_queries:
    enrichments = get_enrichments(reviews_collection, query)
    print(f"{query}: {enrichments}")

bbq: {'category': 'Barbeque', 'term_vector': 'bbq^0.9191 ribs^0.6186 pork^0.5991'}
korean bbq: {'category': 'Korean', 'term_vector': 'bbq^0.9052 korean^0.8641 pork^0.6079'}
lasagna: {'category': 'Italian', 'term_vector': 'lasagna^0.9193 alfredo^0.3992 pasta^0.3909'}
karaoke: {'category': 'Karaoke', 'term_vector': 'karaoke^0.9193 sing^0.6423 songs^0.5256'}
drive through: {'category': 'Fast Food', 'term_vector': 'through^0.8999 drive^0.8613 thru^0.6118'}


### Listing 7.13

In [21]:
# %load -s enrich semantic_search/query_tree/__init__.py
def enrich(collection, query_tree):
    """Resolve semantic functions, then enrich the remaining keyword nodes.

    The tree is first passed through ``process_semantic_functions``. Every
    node of type "keyword" is then replaced in place by an "skg_enriched" node
    holding the result of ``get_enrichments`` for its surface form.

    The original "surface_form" (and "canonical_form") are carried over onto
    the enriched node: ``transform_query`` falls back to
    ``item["surface_form"]`` when the enrichments yield no query text, and
    would raise a KeyError if the field were dropped here.

    Args:
        collection: passed through to ``get_enrichments``.
        query_tree: list of node dicts; it is modified in place.

    Returns:
        The processed query tree.
    """
    query_tree = process_semantic_functions(query_tree)
    for i, item in enumerate(query_tree):
        if item["type"] != "keyword":
            continue
        surface_form = item["surface_form"]
        query_tree[i] = {
            "type": "skg_enriched",
            "surface_form": surface_form,
            "canonical_form": item.get("canonical_form", surface_form),
            "enrichments": get_enrichments(collection, surface_form)}
    return query_tree

### Listing 7.14

In [22]:
# %load -s escape_quotes_in_query,transform_query semantic_search/query_tree/__init__.py
def escape_quotes(text):
    """Return ``text`` with every double quote preceded by a backslash."""
    escaped_quote = '\\"'
    return escaped_quote.join(text.split('"'))

def transform_query(query_tree):
    for i in range(len(query_tree)):
        item = query_tree[i]
        transformed_query = ""
        match item["type"]:
            case "transformed":
                pass
            case "skg_enriched":
                enrichments = item["enrichments"]
                query_string = ""
                
                if "term_vector" in enrichments:
                    query_string = enrichments["term_vector"]
                if "category" in enrichments and len(query_string) > 0:
                    query_string += f' +doc_type:"{enrichments["category"]}"'
                if (len(query_string) == 0):
                    query_string = item["surface_form"]
                    
                transformed_query = '{!edismax v="' + escape_quotes(query_string) + '"}'
            case "color":
                transformed_query = f'+colors_s:"{item["canonical_form"]}"'
            case "known_item" | "event":
                transformed_query = f'+name_s:"{item["canonical_form"]}"'
            case "city":
                transformed_query = f'+city:"{str(item["name"])}"'
            case "brand":
                transformed_query = f'+brand_s:"{item["canonical_form"]}"'
            case _:
                transformed_query = "+{!edismax v=\"" + escape_quotes(item["surface_form"]) + "\"}"
        if transformed_query:
            query_tree[i] = {"type": "transformed",
                            "syntax": "solr",
                            "query": transformed_query}                 
    return query_tree


In [23]:
# End-to-end run of the pipeline: extract entities, build the query tree,
# resolve semantic functions / enrich keywords, then convert to Solr syntax.
query = "good kimchi near charlotte"
reviews_collection = engine.get_collection("reviews")
# Use the entity extractor created for Listing 7.6; no `knowledge_graph`
# object is defined anywhere in this notebook, so the previous call only
# worked if that name happened to exist in the kernel.
tagger_data = extractor.extract_entities(query)
query_tree = generate_query_tree(tagger_data)
enriched_query_tree = enrich(reviews_collection, query_tree)
processed_query_tree = transform_query(enriched_query_tree)
print(json.dumps(processed_query_tree, indent="  "))

[
  {
    "type": "transformed",
    "syntax": "solr",
    "query": "+{!func v=\"mul(if(stars_rating,stars_rating,0),20)\"}"
  },
  {
    "type": "transformed",
    "syntax": "solr",
    "query": "{!edismax v=\"kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\\"Korean\\\"\"}"
  },
  {
    "type": "transformed",
    "syntax": "solr",
    "query": "+{!geofilt d=50 sfield=\"location_coordinates\" pt=\"35.22709,-80.84313\"}"
  }
]


### Listing 7.15

In [24]:
# %load -s to_query_string semantic_search/query_tree/__init__.py
def to_query_string(query_tree):
    """Join the "query" clause of every node into one space-separated string.

    Every node must already have a "query" entry.
    """
    clauses = (entry["query"] for entry in query_tree)
    return " ".join(clauses)

In [25]:
# `enrich` and `transform_query` rewrite the tree in place and return the same
# list, so `query_tree` here already holds the transformed nodes (it is the
# same object as `processed_query_tree`).
query_string = to_query_string(query_tree)
query_string

'+{!func v="mul(if(stars_rating,stars_rating,0),20)"} {!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\"Korean\\""} +{!geofilt d=50 sfield="location_coordinates" pt="35.22709,-80.84313"}'

In [26]:
reviews_collection = engine.get_collection("reviews")
reviews_collection.search(query=query_string)

{'docs': [{'id': 'WnLhd38sH80ViWwzyF7yoA',
   'business_name': 'Hibiscus',
   'city': 'Charlotte',
   'state': 'NC',
   'content': "We ate here for dinner and had a very tasty meal of bibimbap and bulgogi. Both dishes were done well and tasty. We had a very pleasant waitress who provided great service as well. Overall great! We'll definitely be back.",
   'categories': 'Restaurants, Korean, Thai',
   'doc_type': 'Restaurants, Korean, Thai',
   'stars_rating': 5,
   'location_coordinates': '35.171873,-80.849032',
   '_version_': 1798185590184738817},
  {'id': 'DlKuPBPEvU2ynAYzZkmH2Q',
   'business_name': 'Cho Won Garden',
   'city': 'Charlotte',
   'state': 'NC',
   'content': 'Johnny P. Sorry you didnt like the food. The food, service, and \nprices were great. You said go to pepero because you are immature\nAnd you dont know why you said that. This place is a 8/10 if you know\nOf korean culture. This place is good for any occasion..dates too so dont \nbe afraid to try It out.',
   'cat

In [27]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=good+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [28]:
#Cleanup so webserver doesn't keep running after you're done
stop_running_webservers()

Stopping webserver (pid: 10007)


## Success!

Up next: Chapter 8 - [Signals Boosting Models](../ch08/1.signals-boosting.ipynb)