# Semantic Search Application

In [1]:
import sys
sys.path.append('..')
sys.path.append('../webserver')
from aips import *
import json

## Starting the Reviews Search Web Server and Launching the Search Page

In [3]:
def get_running_webservers():
    """Return the PIDs of running `start-webserver.py` processes.

    Runs `ps -ef` through IPython's `!` shell capture and keeps the second
    column of every matching line, which is the PID in `ps -ef` output.
    The result is the captured output, one entry per matching process, and
    is empty when nothing matches.
    """
    # The '[s]' bracket expression still matches "start-webserver.py" but keeps
    # the grep command's own entry in the process list from matching itself.
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    already_running_webservers = get_running_webservers()
    for pid in already_running_webservers:
        print("Stopping webserver (pid: " + pid + ")")
        results = ! xargs kill -9 {pid}

def start_reviews_search_webserver():
    """Restart the reviews search web server and report its PID.

    Stops any instance that is already running, launches
    `../webserver/start-webserver.py` as a background shell job, and prints a
    success message with the first PID found. Nothing is printed when no
    matching process is found afterwards.
    """
    stop_running_webservers() #in case it was already running
    # Replace IPython's `system` hook with os.system; the replacement is not
    # undone, so it stays in effect for the rest of the kernel session.
    # NOTE(review): presumably done so the backgrounded (`&`) server is not
    # tied to IPython's own subprocess handling -- confirm.
    # NOTE(review): `os` is not imported in this cell; presumably it is made
    # available by `from aips import *` -- confirm.
    get_ipython().system = os.system
    ! cd ../webserver && python start-webserver.py &
    # The process list is queried right after launch; this only confirms that a
    # matching process exists, not that the server is accepting requests yet.
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

### Listing 7.2

In [4]:
# Start the web server (any instance that is already running is stopped first)
start_reviews_search_webserver()

Successfully Started Webserver (pid: 13117)!


In [5]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>


### Figure 7.3

In [6]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+near+charlotte" width=100% height="800"></iframe>

### Figure 7.4

In [7]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+charlotte" width=100% height="800"></iframe>

### Figure 7.5

In [8]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.6

In [9]:
%%html
<iframe src="http://localhost:2345/search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.7

In [10]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

### Listing 7.3

In [11]:
!! cat ../data/reviews/entities.csv

['id,surface_form,canonical_form,type,popularity,semantic_function',
 '1,near,{location_distance},semantic_function,90,"location_distance(query, position)"',
 '2,in,{location_distance},semantic_function,100,"location_distance(query, position)"',
 '3,by,{location_distance},semantic_function,90,"location_distance(query, position)"',
 '4,by,{text_within_one_edit_distance},semantic_function,10,"text_within_one_edit_distance(query, ',
 'position)"',
 '5,near,{text_distance},semantic_function,10,"text_distance(query, position)"',
 '6,popular,{popular},semantic_function,100,"popularity(query, position)"',
 '7,top,{popular},semantic_function,100,"popularity(query, position)"',
 '8,best,{popular},semantic_function,100,"popularity(query, position)"',
 '9,good,{popular},semantic_function,100,"popularity(query, position)"',
 '10,violet,violet,color,100,',
 '11,violet crowne,violet crowne,brand,100,',
 '12,violet crowne charlottesville,violet crowne charlottesville,movie_theater,100,',
 '13,violet 

### Listing 7.6

In [12]:
# %load -s tag_query ../webserver/semantic_search/engine/tag_query
def tag_query(post_body):
    """Send query text to the `entities` collection's tag handler.

    `post_body` is posted as the request body to SOLR_URL + '/entities/tag',
    asking for matched text to be echoed, results sorted by descending
    popularity, and the entity fields listed in `fl`.

    Returns the raw response body as text (not parsed).
    """
    tagger_url = SOLR_URL + '/entities/tag?json.nl=map&sort=popularity%20desc&matchText=true&echoParams=all&fl=id,type,canonical_form,name,country:countrycode_s,admin_area:admin_code_1_s,popularity,*_p,semantic_function'
    tagger_response = requests.post(tagger_url, data=post_body)
    return tagger_response.text


In [13]:
# Tag a sample query and parse the tagger's JSON response text for display
json.loads(tag_query("top kimchi near charlotte"))

{'responseHeader': {'status': 0,
  'QTime': 1,
  'params': {'sort': 'popularity desc',
   'matchText': 'true',
   'json.nl': 'map',
   'field': 'name_tag',
   'echoParams': 'all',
   'fl': 'id,type,canonical_form,name,country:countrycode_s,admin_area:admin_code_1_s,popularity,*_p,semantic_function'}},
 'tagsCount': 3,
 'tags': [{'startOffset': 0, 'endOffset': 3, 'matchText': 'top', 'ids': ['7']},
  {'startOffset': 11, 'endOffset': 15, 'matchText': 'near', 'ids': ['1', '5']},
  {'startOffset': 16,
   'endOffset': 25,
   'matchText': 'charlotte',
   'ids': ['4460243', '4612828', '4680560', '4988584', '5234793']}],
 'response': {'numFound': 8,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'id': '1',
    'canonical_form': '{location_distance}',
    'type': 'semantic_function',
    'popularity': 90,
    'semantic_function': 'location_distance(query, position)'},
   {'id': '5',
    'canonical_form': '{text_distance}',
    'type': 'semantic_function',
    'popularity': 10,
    'semantic_

### Listing 7.7

In [14]:
# %load ../webserver/semantic_search/process_semantic_query
import sys
sys.path.append('..')
sys.path.append('../webserver')
from aips import *
import requests, json
from semantic_search.engine.tag_query import *
from semantic_search.resolve_query import *
from semantic_search.query_tree.query_tree_to_resolved_query import *

def _keyword_entry(surface_text):
    """Build the query-tree node and display fragment for untagged text.

    Returns a `(node, fragment)` pair: `node` marks the text as an unknown
    keyword, and `fragment` is the piece appended to the transformed query.
    """
    node = {"type": "keyword", "known": False,
            "surface_form": surface_text, "canonical_form": surface_text}
    fragment = " " + "{ type:keyword, known: false, surface_form: \"" + surface_text + "\"}"
    return node, fragment


def process_semantic_query(query_bytes):
    """Tag a raw query, build its query tree, and resolve it to a query string.

    Args:
        query_bytes: The query text as UTF-8 encoded bytes. The bytes are sent
            to tag_query() as-is and decoded locally to slice out the text
            between tags.

    Returns:
        A dict with four keys:
          "tagged_query"      - the query with each tagged span wrapped in {}.
          "transformed_query" - display string of keyword fragments and the
                                JSON of each chosen entity document.
          "resolved_query"    - output of query_tree_to_resolved_query() for
                                the resolved tree.
          "tagger_data"       - the parsed tagger response.

    Text that is not covered by a usable tag is added to the tree as an
    unknown keyword. This includes the whole query when the response has no
    tags at all.
    """
    text = query_bytes.decode('UTF-8')
    tagged_response = json.loads(tag_query(query_bytes))

    # Index the returned entity documents by id so tags can look them up.
    response_section = tagged_response.get('response') or {}
    doc_map = {doc['id']: doc for doc in (response_section.get('docs') or [])}

    query_tree = []
    tagged_query = ""
    transformed_query = ""
    last_end = 0  # offset in `text` just past the last tag consumed

    for tag in (tagged_response.get('tags') or []):
        # Only ids whose document was actually returned can be ranked.
        candidate_ids = [doc_id for doc_id in tag['ids'] if doc_id in doc_map]
        if not candidate_ids:
            # No usable entity for this tag: leave last_end alone so the
            # matched text is picked up as ordinary keyword text below.
            continue

        # Most popular interpretation wins; max() keeps the first id on ties.
        best_doc_id = max(candidate_ids,
                          key=lambda doc_id: doc_map[doc_id]['popularity'])
        best_doc = doc_map[best_doc_id]

        # Store the unknown text preceding this tag as a keyword.
        leading_text = text[last_end:tag['startOffset']].strip()
        if leading_text:
            node, fragment = _keyword_entry(leading_text)
            query_tree.append(node)
            tagged_query += " " + leading_text
            transformed_query += fragment

        # Store the known entity.
        # TODO: only the top interpretation is kept; the query tree should
        # carry all interpretations and resolve among them.
        query_tree.append(best_doc)
        tagged_query += " {" + tag['matchText'] + "}"
        transformed_query += json.dumps(best_doc)
        last_end = tag['endOffset']

    # Anything after the last consumed tag is also unknown keyword text.
    trailing_text = text[last_end:].strip()
    if trailing_text:
        node, fragment = _keyword_entry(trailing_text)
        query_tree.append(node)
        tagged_query += " " + trailing_text
        transformed_query += fragment

    # Use the tree returned by resolve_query() rather than assuming the input
    # list was updated in place.
    resolved_tree = resolve_query(query_tree)
    resolved_query = query_tree_to_resolved_query(resolved_tree)

    return {
        "tagged_query": tagged_query,
        "transformed_query": transformed_query,
        "resolved_query": resolved_query,
        "tagger_data": tagged_response
    }

### Listing 7.8

In [15]:
# %load ../webserver/semantic_search/semantic_functions/popularity
def popularity(query, position):
    """Replace the node at `position` with a star-rating boost clause.

    The replacement happens only when at least one more node follows
    `position` in `query['query_tree']`. In that case the node becomes a
    "solr" node holding a function query over `stars_i`.

    Returns True when the node was replaced, False otherwise.
    """
    tree = query['query_tree']
    if position >= len(tree) - 1:
        # Nothing follows this node, so leave the tree untouched.
        return False

    tree[position] = {
        "type": "solr",
        "query": '+{!func v="mul(if(stars_i,stars_i,0),20)"}',
    }
    return True

### Listing 7.9

In [16]:
# %load -s location_distance,create_geo_filter ../webserver/semantic_search/semantic_functions/location_distance
def location_distance(query, position):
    """Turn "<this node> <city>" into a single geo-filter node.

    When the node right after `position` in `query['query_tree']` has type
    "city", that city node is removed and the node at `position` is replaced
    by a "solr" node. The new node's query is a geo filter around the city's
    `location_p` value, applied to the "location_p" field with a distance of 50.

    Returns True when the rewrite happened, False otherwise.
    """
    tree = query['query_tree']
    if position >= len(tree) - 1:
        # No following node to inspect.
        return False

    following = tree[position + 1]
    if following['type'] != "city":
        return False

    # Drop the city node first, then put the filter in this node's place.
    tree.pop(position + 1)
    geo_filter = create_geo_filter(following['location_p'], "location_p", 50)
    tree[position] = {"type": "solr", "query": geo_filter}
    return True

def create_geo_filter(coordinates, field, distanceInKM):
    """Build a required `{!geofilt}` clause string.

    `coordinates` and `field` must be strings and are embedded in double
    quotes as `pt` and `sfield`; `distanceInKM` is converted with str() and
    used as `d`.
    """
    pieces = [
        "+{!geofilt d=", str(distanceInKM),
        " sfield=\"", field,
        "\" pt=\"", coordinates,
        "\"}",
    ]
    return "".join(pieces)

### Listing 7.10

In [17]:
# %load -s structured_search ../webserver/semantic_search/engine/structured_search
def structured_search(json_query):
    """POST a JSON request to the reviews collection's select handler.

    `json_query` is passed to requests as the JSON body of a POST to
    SOLR_URL + '/reviews/select'.

    Returns the raw response body as text (not parsed).
    """
    # requests serialises the `json=` payload itself, so no separate
    # json.dumps() call is needed here.
    return requests.post(SOLR_URL + '/reviews/select', json=json_query).text

In [18]:
# %load -s get_category_and_term_vector_solr_response ../webserver/semantic_search/engine/get_category_and_term_vector_solr_response
def get_category_and_term_vector_solr_response(keyword):
    """Fetch related terms and the top doc_type for `keyword`.

    Builds a facet-only request (no documents returned) whose
    "term_needing_vector" query facet is restricted to `keyword` and holds
    two terms facets, each sorted descending by `relatedness($fore,$back)`:
      - "related_terms": up to 3 values of `text_t`
      - "doc_type":      the single top value of `doc_type`
    `keyword` is the foreground query and `*:*` the background.

    Returns the parsed JSON response from structured_search().
    """
    relatedness = "relatedness($fore,$back)"

    related_terms_facet = {
        "type": "terms", "field": "text_t", "limit": 3,
        "sort": {"r1": "desc"},
        "facet": {"r1": relatedness},
    }
    doc_type_facet = {
        "type": "terms", "field": "doc_type", "limit": 1,
        "sort": {"r2": "desc"},
        "facet": {"r2": relatedness},
    }
    keyword_facet = {
        "type": "query", "query": keyword,
        "facet": {
            "related_terms": related_terms_facet,
            "doc_type": doc_type_facet,
        },
    }
    request = {
        "params": {"fore": keyword, "back": "*:*", "df": "text_t"},
        "query": "*:*",
        "limit": 0,
        "facet": {"term_needing_vector": keyword_facet},
    }

    return json.loads(structured_search(request))

In [19]:
# Show the raw facet response (related terms and top doc_type) for a sample keyword
get_category_and_term_vector_solr_response("kimchi")

{'responseHeader': {'zkConnected': True,
  'status': 0,
  'QTime': 69,
  'params': {'json': '{"params": {"fore": "kimchi", "back": "*:*", "df": "text_t"}, "query": "*:*", "limit": 0, "facet": {"term_needing_vector": {"type": "query", "query": "kimchi", "facet": {"related_terms": {"type": "terms", "field": "text_t", "limit": 3, "sort": {"r1": "desc"}, "facet": {"r1": "relatedness($fore,$back)"}}, "doc_type": {"type": "terms", "field": "doc_type", "limit": 1, "sort": {"r2": "desc"}, "facet": {"r2": "relatedness($fore,$back)"}}}}}}'}},
 'response': {'numFound': 192140,
  'start': 0,
  'numFoundExact': True,
  'docs': []},
 'facets': {'count': 192140,
  'term_needing_vector': {'count': 193,
   'related_terms': {'buckets': [{'val': 'kimchi',
      'count': 193,
      'r1': {'relatedness': 0.91934,
       'foreground_popularity': 0.001,
       'background_popularity': 0.001}},
     {'val': 'korean',
      'count': 95,
      'r1': {'relatedness': 0.7069,
       'foreground_popularity': 0.0004

### Listing 7.11

In [20]:
# %load -s resolve_query ../webserver/semantic_search/resolve_query
def resolve_query(query_tree):
    """Convert every node of the query tree into a "solr" query node.

    The tree is first handed to process_semantic_functions(). Every node of
    the resulting tree that is not already of type "solr" is then replaced
    in place by `{"type": "solr", "query": ...}` according to its type:

      keyword            - edismax over the term vector returned for the
                           keyword (plus a doc_type filter when both a term
                           vector and a category are available), or over
                           the keyword itself when no term vector is found
      color              - +colors_s:"<canonical_form>"
      city               - +city_t:"<name>"
      known_item, event  - +name_s:"<canonical_form>"
      brand              - +brand_s:"<canonical_form>"
      anything else      - the type is printed and an edismax over the
                           node's surface_form is used

    Returns the resolved tree.
    """
    query_tree = process_semantic_functions(query_tree)

    # Now process everything that is not yet resolved
    for position, item in enumerate(query_tree):
        item_type = item["type"]
        if item_type == "solr":
            continue  # already resolved

        if item_type == "keyword":
            solr_response = get_category_and_term_vector_solr_response(item["surface_form"])
            category_and_term_vector = parse_category_and_term_vector_from_solr_response(solr_response)

            query_string = ""
            if "term_vector" in category_and_term_vector:
                query_string = category_and_term_vector["term_vector"]

            # The category filter is only added alongside a term vector, so a
            # keyword is never reduced to a bare doc_type filter.
            if "category" in category_and_term_vector and len(query_string) > 0:
                query_string += " "
                query_string += "+doc_type:\"" + category_and_term_vector["category"] + "\""

            if len(query_string) == 0:
                query_string = item["surface_form"]  # just keep the input as a keyword

            solr_query = "+{!edismax v=\"" + escape_quotes_in_query(query_string) + "\"}"
        elif item_type == "color":
            solr_query = "+colors_s:\"" + item["canonical_form"] + "\""
        elif item_type == "city":
            # Checked before the name_s branch: "city" used to be listed there
            # as well, which made this branch unreachable.
            solr_query = "+city_t:\"" + str(item["name"]) + "\""
        elif item_type in ("known_item", "event"):
            solr_query = "+name_s:\"" + item["canonical_form"] + "\""
        elif item_type == "brand":
            solr_query = "+brand_s:\"" + item["canonical_form"] + "\""
        else:
            # Unhandled entity type: report it and fall back to a text search.
            print(item_type)
            solr_query = "+{!edismax v=\"" + escape_quotes_in_query(item["surface_form"]) + "\"}"

        query_tree[position] = {"type": "solr", "query": solr_query}

    return query_tree


### Listing 7.12

In [21]:
# %load -s query_tree_to_resolved_query ../webserver/semantic_search/query_tree/query_tree_to_resolved_query
def query_tree_to_resolved_query(query_tree):
    """Concatenate the `query` string of every node into one query string.

    Clauses are appended in tree order. A single space is inserted before a
    clause only when the string built so far is non-empty. An empty tree
    gives an empty string.
    """
    combined = ""
    for node in query_tree:
        prefix = combined + " " if len(combined) > 0 else combined
        combined = prefix + node['query']
    return combined


### Figure 7.8

In [22]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=good+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [23]:
# Clean up: kill the web server process so it doesn't keep running after you're done
stop_running_webservers()

Stopping webserver (pid: 13117)


## Success!

Up next: Chapter 8 - [Signals Boosting Models](../ch08/1.signals-boosting.ipynb)