# Semantic Search Application

In [1]:
import sys
sys.path.append('..')
sys.path.append('../webserver')
from semantic_search.engine.tag_query import *
from semantic_search.process_semantic_query import *
from semantic_search.query_tree.query_tree_to_transformed_query import *
from aips import *
import json

## Starting the Reviews Search Web Server and Launching the Search Page

In [2]:
def get_running_webservers():
    """Return the process ids of running start-webserver.py processes.

    Returns:
        The captured output lines of the shell pipeline, one pid per line;
        empty when no matching process is found.
    """
    # The [s] bracket expression keeps grep from matching its own command line,
    # and awk prints the second column of `ps -ef` (the pid).
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    already_running_webservers = get_running_webservers()
    for pid in already_running_webservers:
        print("Stopping webserver (pid: " + pid + ")")
        results = ! xargs kill -9 {pid}

def start_reviews_search_webserver():
    stop_running_webservers() #in case it was already running
    get_ipython().system = os.system
    ! cd ../webserver && python start-webserver.py &
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

### Listing 7.2

In [3]:
#Start the web server
start_reviews_search_webserver()

Stopping webserver (pid: 44378)
Successfully Started Webserver (pid: 45351)!


In [4]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>


### Figure 7.3

In [5]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+near+charlotte" width=100% height="800"></iframe>

### Figure 7.4

In [6]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+charlotte" width=100% height="800"></iframe>

### Figure 7.5

In [7]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.6

In [8]:
%%html
<iframe src="http://localhost:2345/search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.7

In [9]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

### Listing 7.3

In [10]:
!! cat ../data/reviews/entities.csv

['id,surface_form,canonical_form,type,popularity,semantic_function',
 '1,near,{location_distance},semantic_function,90,"location_distance(query, position)"',
 '2,in,{location_distance},semantic_function,100,"location_distance(query, position)"',
 '3,by,{location_distance},semantic_function,90,"location_distance(query, position)"',
 '4,by,{text_within_one_edit_distance},semantic_function,10,"text_within_one_edit_distance(query, ',
 'position)"',
 '5,near,{text_distance},semantic_function,10,"text_distance(query, position)"',
 '6,popular,{popular},semantic_function,100,"popularity(query, position)"',
 '7,top,{popular},semantic_function,100,"popularity(query, position)"',
 '8,best,{popular},semantic_function,100,"popularity(query, position)"',
 '9,good,{popular},semantic_function,100,"popularity(query, position)"',
 '10,violet,violet,color,100,',
 '11,violet crowne,violet crowne,brand,100,',
 '12,violet crowne charlottesville,violet crowne charlottesville,movie_theater,100,',
 '13,violet 

### Listing 7.6

In [11]:
# %load -s tag_query ../webserver/semantic_search/engine/tag_query
def tag_query(post_body):
    """Send query text to the tagger endpoint of the `entities` collection.

    Args:
        post_body: the query to tag, passed to requests.post as the request body.

    Returns:
        The raw response body as text.
    """
    # Adjacent literals are joined at compile time into a single request path.
    tagger_path = ('/entities/tag'
                   '?json.nl=map'
                   '&sort=popularity%20desc'
                   '&matchText=true'
                   '&echoParams=all'
                   '&fl=id,type,canonical_form,name,country:countrycode_s,'
                   'admin_area:admin_code_1_s,popularity,*_p,semantic_function')
    tagger_response = requests.post(SOLR_URL + tagger_path, post_body)
    return tagger_response.text


In [12]:
json.loads(tag_query("top kimchi near charlotte"))

{'responseHeader': {'status': 0,
  'QTime': 0,
  'params': {'sort': 'popularity desc',
   'matchText': 'true',
   'json.nl': 'map',
   'field': 'name_tag',
   'echoParams': 'all',
   'fl': 'id,type,canonical_form,name,country:countrycode_s,admin_area:admin_code_1_s,popularity,*_p,semantic_function'}},
 'tagsCount': 3,
 'tags': [{'startOffset': 0, 'endOffset': 3, 'matchText': 'top', 'ids': ['7']},
  {'startOffset': 11, 'endOffset': 15, 'matchText': 'near', 'ids': ['1', '5']},
  {'startOffset': 16,
   'endOffset': 25,
   'matchText': 'charlotte',
   'ids': ['4460243', '4612828', '4680560', '4988584', '5234793']}],
 'response': {'numFound': 8,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'id': '1',
    'canonical_form': '{location_distance}',
    'type': 'semantic_function',
    'popularity': 90,
    'semantic_function': 'location_distance(query, position)'},
   {'id': '5',
    'canonical_form': '{text_distance}',
    'type': 'semantic_function',
    'popularity': 10,
    'semantic_

### Listing 7.7

In [13]:
# %load -s process_semantic_query ../webserver/semantic_search/process_semantic_query
def process_semantic_query(query_bytes):
    """Tag, enrich and transform a raw query into its search representations.

    Args:
        query_bytes: the UTF-8 encoded query.

    Returns:
        A dict with the keys "tagged_query", "enriched_query",
        "transformed_query" and "tagger_data".
    """
    query_text = query_bytes.decode('UTF-8')
    tagger_data = json.loads(tag_query(query_bytes))

    representations = generate_query_representations(query_text, tagger_data)
    query_tree, tagged_query, enriched_query = representations

    # The returned tree is not kept; the transformation below is handed the same
    # query_tree object. NOTE(review): presumably this relies on the tree being
    # resolved in place -- confirm.
    process_query_tree(query_tree)

    return {
        "tagged_query": tagged_query,
        "enriched_query": enriched_query,
        "transformed_query": query_tree_to_transformed_query(query_tree),
        "tagger_data": tagger_data,
    }


In [14]:
# Run the whole pipeline on a sample query and print each representation it returns.
query_bytes = bytes("top kimchi near charlotte", 'UTF-8')
processed_query = process_semantic_query(query_bytes)
print("Tagged Query:" + processed_query["tagged_query"])
print("\nEnriched Query:" + processed_query["enriched_query"])
print("\nTransformed Query:" + processed_query["transformed_query"])

Tagged Query: {top} kimchi {near} {charlotte}

Enriched Query:{"id": "7", "canonical_form": "{popular}", "type": "semantic_function", "popularity": 100, "semantic_function": "popularity(query, position)"} { type:keyword, known: false, surface_form: "kimchi"}{"id": "1", "canonical_form": "{location_distance}", "type": "semantic_function", "popularity": 90, "semantic_function": "location_distance(query, position)"}{"id": "4460243", "name": ["Charlotte"], "canonical_form": "Charlotte", "popularity": 827097, "type": "city", "location_p": "35.22709,-80.84313", "admin_area": "NC"}

Transformed Query:+{!func v="mul(if(stars_i,stars_i,0),20)"} +{!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\"Korean\""} +{!geofilt d=50 sfield="location_p" pt="35.22709,-80.84313"}


### Listing 7.8

In [15]:
# %load -s popularity ../webserver/semantic_search/semantic_functions/popularity
def popularity(query, position):
    """Swap the node at `position` for a popularity function query.

    The replacement only happens when at least one node follows `position`
    in query['query_tree'].

    Args:
        query: dict holding the list of nodes under 'query_tree'.
        position: index of the node that triggered this function.

    Returns:
        True if the node was replaced, False otherwise.
    """
    tree = query['query_tree']
    if position >= len(tree) - 1:
        # nothing follows this node, so leave it untouched
        return False

    # stars_i (or 0 when it has no value) multiplied by 20
    tree[position] = {
        "type": "solr",
        "query": '+{!func v="mul(if(stars_i,stars_i,0),20)"}',
    }
    return True


### Listing 7.9

In [16]:
# %load -s location_distance,create_geo_filter ../webserver/semantic_search/semantic_functions/location_distance
def location_distance(query, position):
    """Merge the node at `position` and a following city node into a geo filter.

    When the node right after `position` has type "city", that node is removed
    from query['query_tree'] and the node at `position` is replaced by a solr
    node filtering on the city's 'location_p' value with a distance of 50.

    Args:
        query: dict holding the list of nodes under 'query_tree'.
        position: index of the node that triggered this function.

    Returns:
        True if the geo filter was created, False otherwise.
    """
    tree = query['query_tree']
    if not (len(tree) - 1 > position):
        return False

    following = tree[position + 1]
    if following['type'] != "city":
        return False

    # the city node is consumed by the filter, so drop it before replacing
    tree.pop(position + 1)
    geo_filter = create_geo_filter(following['location_p'], "location_p", 50)
    tree[position] = {"type": "solr", "query": geo_filter}
    return True

def create_geo_filter(coordinates, field, distanceInKM):
    """Build a required `{!geofilt}` clause.

    Args:
        coordinates: string placed in the `pt` argument.
        field: name of the field placed in the `sfield` argument.
        distanceInKM: value placed in the `d` argument (converted with str()).

    Returns:
        The filter clause as a string.
    """
    pieces = [
        "+{!geofilt d=", str(distanceInKM),
        " sfield=\"", field,
        "\" pt=\"", coordinates,
        "\"}",
    ]
    return "".join(pieces)


### Listing 7.10

In [17]:
#NOTE: DELETE structured_search when Daniel converts over to collection.search(...)
# %load -s structured_search ../webserver/semantic_search/engine/structured_search
def structured_search(json_query):
    """POST a JSON request to the select handler of the `reviews` collection.

    Args:
        json_query: JSON-serializable request; it is handed to requests via
            `json=`, which serializes it, so no manual json.dumps is needed.

    Returns:
        The raw response body as text.
    """
    return requests.post(SOLR_URL + '/reviews/select', json=json_query).text

In [18]:
# %load -s traverse_skg ../webserver/semantic_search/engine/semantic_knowledge_graph
def traverse_skg(keyword):
    """Request the terms and doc_type most related to `keyword`.

    Builds a facet request whose foreground query is `keyword` and whose
    background query is `*:*`, asking for the top 3 `text_t` terms and the
    top 1 `doc_type`, each sorted by relatedness, and sends it through
    structured_search.

    Args:
        keyword: the query used as the foreground set.

    Returns:
        The parsed JSON response.
    """
    relatedness = "relatedness($fore,$back)"

    related_terms_facet = {
        "type": "terms", "field": "text_t",
        "limit": 3, "sort": {"r1": "desc"},
        "facet": {"r1": relatedness},
    }
    doc_type_facet = {
        "type": "terms", "field": "doc_type",
        "limit": 1, "sort": {"r2": "desc"},
        "facet": {"r2": relatedness},
    }
    keyword_facet = {
        "type": "query", "query": keyword,
        "facet": {
            "related_terms": related_terms_facet,
            "doc_type": doc_type_facet,
        },
    }
    request = {
        "params": {"fore": keyword, "back": "*:*", "df": "text_t"},
        "query": "*:*", "limit": 0,
        "facet": {"term_needing_vector": keyword_facet},
    }

    return json.loads(structured_search(request))


In [19]:
# %load -s parse_skg_response ../webserver/semantic_search/engine/semantic_knowledge_graph
def parse_skg_response(skg_response):
    """Extract the category and the weighted term vector from an SKG response.

    Args:
        skg_response: parsed response dict; the values are read from
            skg_response['facets']['term_needing_vector'] when present.

    Returns:
        A dict that always holds 'term_vector' (space-separated
        `term^weight` entries with weights formatted to four decimals, or
        an empty string) and holds 'category' only when a doc_type bucket
        exists.
    """
    result = {}
    term_buckets = []

    if 'facets' in skg_response and 'term_needing_vector' in skg_response['facets']:
        node = skg_response['facets']['term_needing_vector']

        has_category = ('doc_type' in node
                        and 'buckets' in node['doc_type']
                        and len(node['doc_type']['buckets']) > 0)
        if has_category:
            # only the top bucket is used
            result['category'] = node['doc_type']['buckets'][0]['val']

        has_terms = ('related_terms' in node
                     and 'buckets' in node['related_terms']
                     and len(node['related_terms']['buckets']) > 0)
        if has_terms:
            term_buckets = node['related_terms']['buckets']

    weighted_terms = [
        bucket['val'] + "^" + "{:.4f}".format(bucket['r1']['relatedness'])
        for bucket in term_buckets
    ]
    result['term_vector'] = " ".join(weighted_terms)

    return result


In [20]:
# Traverse the knowledge graph for one keyword and display the parsed result.
query = "kimchi"
skg_response = traverse_skg(query)
parse_skg_response(skg_response)

{'category': 'Korean',
 'term_vector': 'kimchi^0.9193 korean^0.7069 banchan^0.6593'}

In [21]:
def skg(query):
    """Traverse the knowledge graph for `query` and return the parsed result."""
    raw_response = traverse_skg(query)
    return parse_skg_response(raw_response)

# Print the parsed knowledge-graph result for a few more queries.
# Note: the loop rebinds the notebook-level name `query`.
other_queries = ["bbq", "korean bbq", "lasagna", "karaoke", "drive through"]
for query in other_queries: print(f"{query}: {skg(query)}")


bbq: {'category': 'Barbeque', 'term_vector': 'bbq^0.9191 ribs^0.6186 pork^0.5991'}
korean bbq: {'category': 'Korean', 'term_vector': 'bbq^0.9052 korean^0.8641 pork^0.6079'}
lasagna: {'category': 'Italian', 'term_vector': 'lasagna^0.9193 alfredo^0.3992 pasta^0.3909'}
karaoke: {'category': 'Karaoke', 'term_vector': 'karaoke^0.9193 sing^0.6423 songs^0.5256'}
drive through: {'category': 'Fast Food', 'term_vector': 'through^0.8999 drive^0.8613 thru^0.6118'}


### Listing 7.11

In [22]:
# %load -s process_query_tree ../webserver/semantic_search/process_query_tree
def process_query_tree(query_tree):
    """Resolve every node of a query tree into a solr query node.

    Semantic functions are applied first through process_semantic_functions.
    Every remaining node whose type is not "solr" is then replaced, at the
    same position, by a {"type": "solr", "query": ...} dict chosen by the
    node's type.

    Args:
        query_tree: list of node dicts. Every node has a "type" key and,
            depending on that type, "surface_form" or "canonical_form".

    Returns:
        The list returned by process_semantic_functions with all nodes resolved.
    """
    query_tree = process_semantic_functions(query_tree)

    # Now process everything that is not yet resolved
    for position in range(len(query_tree)):
        item = query_tree[position]
        if (item["type"] != "solr"): # nodes of type "solr" are already resolved
            if (item["type"] == "keyword"):
                # Expand the keyword with its related terms and category
                skgResponse = traverse_skg(item["surface_form"])
                categoryAndTermVector = parse_skg_response(skgResponse)

                queryString = ""
                if ("term_vector" in categoryAndTermVector):
                    queryString = categoryAndTermVector["term_vector"]

                # NOTE(review): the doc_type filter is only appended when a term
                # vector was found; with a category but no term vector the plain
                # keyword fallback below is used -- confirm this is intended.
                if ("category" in categoryAndTermVector):
                    if (len(queryString) > 0):
                        queryString += " "
                        queryString += "+doc_type:\"" + categoryAndTermVector["category"] + "\""

                if (len(queryString) == 0):
                    queryString = item["surface_form"] #just keep the input as a keyword

                query_tree[position] = { "type":"solr", "query": "+{!edismax v=\"" + escape_quotes_in_query(queryString) + "\"}" }
            elif (item["type"] == "color"):
                solrQuery = "+colors_s:\"" + item["canonical_form"] + "\""
                query_tree[position] = {"type":"solr", "query": solrQuery}
            elif (item["type"] == "known_item" or item["type"] == "city" or item["type"] == "event"):
                # A city node that is still unresolved at this point is matched by
                # name here. The former separate `elif item["type"] == "city"`
                # branch (city_t) could never run because this branch already
                # matches "city", so it was removed.
                solrQuery = "+name_s:\"" + item["canonical_form"] + "\""
                query_tree[position] = {"type":"solr", "query": solrQuery}
            elif (item["type"] == "brand"):
                solrQuery = "+brand_s:\"" + item["canonical_form"] + "\""
                query_tree[position] = {"type":"solr", "query": solrQuery}
            else:
                # Unrecognised type: report it and fall back to a keyword query
                print(item["type"])
                query_tree[position] = {"type":"solr", "query": "+{!edismax v=\"" + escape_quotes_in_query(item["surface_form"]) + "\"}"}

    return query_tree


In [23]:
# Tag a sample query, build its query tree and display the resolved tree.
query = "good kimchi in charlotte"
tagger_data = json.loads(tag_query(bytes(query, "utf-8")))
query_tree, tagged_query, parsed_query = generate_query_representations(query, tagger_data)
processed_query_tree = process_query_tree(query_tree)
processed_query_tree

[{'type': 'solr', 'query': '+{!func v="mul(if(stars_i,stars_i,0),20)"}'},
 {'type': 'solr',
  'query': '+{!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\"Korean\\""}'},
 {'type': 'solr',
  'query': '+{!geofilt d=50 sfield="location_p" pt="35.22709,-80.84313"}'}]

### Listing 7.12

In [24]:
# %load -s query_tree_to_resolved_query ../webserver/semantic_search/query_tree/query_tree_to_resolved_query
def query_tree_to_resolved_query(query_tree):
    """Concatenate the 'query' of every node into one query string.

    Args:
        query_tree: list of node dicts, each with a 'query' string.

    Returns:
        The node queries in order; a single space is inserted before a
        node's query whenever the text accumulated so far is non-empty.
    """
    resolved = ""
    for node in query_tree:
        separator = " " if resolved else ""
        resolved = resolved + separator + node['query']
    return resolved


In [25]:
### TODO: the following is added to the manuscript. Should decouple query_tree generation so it can be passed in and just work
#collection = engine.get_collection("reviews")
query_tree_to_resolved_query(query_tree)

#searchResults = collection.search(query_tree)

'+{!func v="mul(if(stars_i,stars_i,0),20)"} +{!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\"Korean\\""} +{!geofilt d=50 sfield="location_p" pt="35.22709,-80.84313"}'

### Figure 7.8

In [26]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=good+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [27]:
#Cleanup so webserver doesn't keep running after you're done
stop_running_webservers()

Stopping webserver (pid: 45351)


## Success!

Up next: Chapter 8 - [Signals Boosting Models](../ch08/1.signals-boosting.ipynb)