# Semantic Search Application

In [1]:
import os

from aips import get_engine, get_entity_extractor, get_semantic_engine, get_semantic_knowledge_graph, get_sparse_semantic_search
from engines.Engine import AdvancedFeatures
from semantic_search import *
from semantic_search.query_tree import *
import aips.indexer 
import inspect

# Select Weaviate as the active engine before any engine-backed helper is used.
from aips import set_engine; set_engine("weaviate")
engine = get_engine()

# Rebuild both collections from scratch (force_rebuild=True) so the notebook
# starts from a known index state; log=True prints the schema and progress.
entities_collection = aips.indexer.build_collection(engine, "entities", force_rebuild=True, log=True)
reviews_collection = aips.indexer.build_collection(engine, "reviews", force_rebuild=True, log=True)
# Helper object whose popularity / location_distance / transform_query methods
# are inspected in the listings further down.
sparse_semantic = get_sparse_semantic_search()

Reindexing [entities] collection
Wiping "entities" collection
Creating "entities" collection
Schema: {
  "class": "entities",
  "properties": [
    {
      "name": "__id",
      "dataType": [
        "text"
      ]
    },
    {
      "name": "surface_form",
      "dataType": [
        "text"
      ]
    },
    {
      "name": "canonical_form",
      "dataType": [
        "text"
      ]
    },
    {
      "name": "type",
      "dataType": [
        "text"
      ]
    },
    {
      "name": "popularity",
      "dataType": [
        "int"
      ]
    },
    {
      "name": "semantic_function",
      "dataType": [
        "text"
      ]
    },
    {
      "name": "location_coordinates",
      "dataType": [
        "text"
      ]
    },
    {
      "name": "country",
      "dataType": [
        "text"
      ]
    },
    {
      "name": "admin_area",
      "dataType": [
        "text"
      ]
    }
  ]
}
Status: <Response [200]>
Response: {'class': 'Entities', 'invertedIndexConfig': {'bm25':

100%|██████████| 8.0/8.0 [00:06<00:00,  1.33it/s]    


Extracting [data/repositories/reviews/entities.tgz] to [data/reviews]
Loading data/reviews/entities.csv
Schema: 
root
 |-- id: integer (nullable = true)
 |-- surface_form: string (nullable = true)
 |-- canonical_form: string (nullable = true)
 |-- type: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- semantic_function: string (nullable = true)

Successfully written 21 documents
Successfully written 21 documents
File [data/reviews/cities.csv] exists? False
Pulling existing repo [data/repositories/reviews]
Extracting [data/repositories/reviews/cities.tgz] to [data/reviews]
Loading Geonames...
Successfully written 137581 documents
Successfully written 137581 documents
Reindexing [reviews] collection
Wiping "reviews" collection
Creating "reviews" collection
Schema: {
  "class": "reviews",
  "properties": [
    {
      "name": "__id",
      "dataType": [
        "text"
      ]
    },
    {
      "name": "business_name",
      "dataType": [
        "text"
      ]
   

### Listing 7.2 & Figure 7.2
<a id='listing-7.2'></a>

In [2]:
def get_running_webservers():
    """Return the PIDs of running `start-webserver.py` processes.

    Returns:
        The captured output lines of the shell pipeline, one PID per line
        (empty when no matching process is found).
    """
    # IPython shell capture. The `[s]` bracket keeps grep from matching its
    # own command line; awk prints the second `ps -ef` column (the PID).
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    """Force-kill every webserver process reported by get_running_webservers()."""
    already_running_webservers = get_running_webservers()
    for pid in already_running_webservers:
        print("Stopping webserver (pid: " + pid + ")")
        # `{pid}` is interpolated by IPython before the shell runs; -9 sends
        # SIGKILL. The captured output in `results` is not used afterwards.
        results = ! xargs kill -9 {pid}

def start_reviews_search_webserver():
    """(Re)start the reviews search webserver as a background process.

    Any webserver that is already running is killed first. A success message
    with the PID is printed only if a webserver process is visible right after
    the launch; nothing is printed otherwise.
    """
    stop_running_webservers() #in case it was already running
    # Route `!` commands through os.system from here on -- presumably so the
    # backgrounded (`&`) launch below does not block the cell (TODO confirm).
    # NOTE(review): this override is not restored anywhere in this function.
    get_ipython().system = os.system
    ! cd webserver && python start-webserver.py &
    # Checked immediately after launch, so a slow-starting or failing server
    # simply produces no message.
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

In [3]:
start_reviews_search_webserver()

Successfully Started Webserver (pid: 1795)!


In [4]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>

### Figure 7.3

In [5]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+near+charlotte" width=100% height="800"></iframe>

### Figure 7.4

In [6]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+charlotte" width=100% height="800"></iframe>

### Figure 7.5

In [7]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

### Figure 7.6

In [8]:
%%html
<iframe src="http://localhost:2345/search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.7

In [9]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [10]:
stop_running_webservers()

Stopping webserver (pid: 1795)


### Listing 7.3 : [located here](1.index-datasets.ipynb#listing_7_3).
### Listing 7.4 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.4).
### Listing 7.5 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.5).

### Listing 7.6

In [11]:
query = "top kimchi near charlotte"
# Fetch the entities collection again and build an extractor backed by it.
entities_collection = engine.get_collection("entities")
extractor = get_entity_extractor(entities_collection)
# The result holds the original query, the tagged spans ("tags") and the
# candidate entity records ("entities") referenced by those tags.
query_entities = extractor.extract_entities(query)
display(query_entities)

{'query': 'top kimchi near charlotte',
 'tags': [{'startOffset': 0, 'endOffset': 3, 'matchText': 'top', 'ids': ['7']},
  {'startOffset': 11, 'endOffset': 15, 'matchText': 'near', 'ids': ['1', '5']},
  {'startOffset': 16,
   'endOffset': 25,
   'matchText': 'charlotte',
   'ids': ['a86e45ca-90e3-4eab-8b03-ddb73e943e04',
    '3e22c456-b5f8-46da-9d46-7a2dbef882f3',
    '19700a21-0c25-4a30-bd52-b0813e6dae0f',
    '5f054a5a-8bf1-4072-ae52-da81a8f84de0',
    'c80fcbb2-8c6c-484c-813b-3555b95e81f9']}],
 'entities': [{'semantic_function': 'location_distance(query, position)',
   'popularity': 90,
   'id': '1',
   'surface_form': 'near',
   'type': 'semantic_function',
   'canonical_form': '{location_distance}'},
  {'semantic_function': 'text_distance(query, position)',
   'popularity': 10,
   'id': '5',
   'surface_form': 'near',
   'type': 'semantic_function',
   'canonical_form': '{text_distance}'},
  {'semantic_function': 'popularity(query, position)',
   'popularity': 100,
   'id': '7',
   

### Listing 7.7


In [12]:
def generate_tagged_query(extracted_entities):
    """Render the query with every tagged span wrapped in curly braces.

    Args:
        extracted_entities: dict with the original text under "query" and a
            list of tag dicts under "tags"; each tag provides "startOffset",
            "endOffset" and "matchText".

    Returns:
        A string in which each piece (untagged text or "{matchText}") is
        preceded by a single space, so a non-empty result starts with a space.
    """
    text = extracted_entities["query"]
    pieces = []
    cursor = 0
    for tag in extracted_entities["tags"]:
        # Untagged text between the previous tag and this one, if any.
        gap = text[cursor:tag["startOffset"]].strip()
        if gap:
            pieces.append(gap)
        pieces.append("{" + tag["matchText"] + "}")
        cursor = tag["endOffset"]
    # Whatever is left after the last tag.
    tail = text[cursor:].strip()
    if tail:
        pieces.append(tail)
    return "".join(" " + piece for piece in pieces)

# Render the extraction result from Listing 7.6 with its tagged spans in braces.
tagged_query = generate_tagged_query(query_entities)
print(tagged_query)

 {top} kimchi {near} {charlotte}


### Listing 7.8

In [13]:
# %load -s generate_query_tree semantic_search/__init__.py
def generate_query_tree(extracted_entities):
    """Build a flat, ordered query tree from entity-extraction output.

    Tags are walked left to right. For each tag the candidate entity with the
    highest "popularity" is chosen (the first candidate wins ties), and any
    untagged text before the tag becomes a "keyword" node. Text remaining
    after the last tag becomes a final "keyword" node.

    Args:
        extracted_entities: dict with
            "query": the original query string,
            "tags": list of dicts with "startOffset", "endOffset" and "ids",
            "entities": list of entity dicts, each with "id" and "popularity".

    Returns:
        A list of node dicts in query order. Entity nodes are shallow copies
        of the chosen entity dicts; keyword nodes have "type", "surface_form"
        and "canonical_form".

    Raises:
        KeyError: if a tag references an id that is not in "entities".
        IndexError: if a tag has an empty "ids" list.
    """
    query = extracted_entities["query"]
    entities = {entity["id"]: entity for entity
                in extracted_entities["entities"]}
    query_tree = []
    last_end = 0

    for tag in extracted_entities["tags"]:
        best_entity = entities[tag["ids"][0]]
        for entity_id in tag["ids"]:
            # Strict ">" keeps the earliest candidate when popularity ties.
            if (entities[entity_id]["popularity"] >
                best_entity["popularity"]):
                best_entity = entities[entity_id]

        next_text = query[last_end:tag["startOffset"]].strip()
        if next_text:
            query_tree.append({"type": "keyword",
                               "surface_form": next_text,
                               "canonical_form": next_text})
        # Append a copy: later stages edit tree nodes in place (for example
        # by changing a node's "type"), and sharing the dict would leak those
        # edits into the extractor output and into repeated occurrences of
        # the same entity.
        query_tree.append(dict(best_entity))
        last_end = tag["endOffset"]

    final_text = query[last_end:].strip()
    if final_text:
        query_tree.append({"type": "keyword",
                           "surface_form": final_text,
                           "canonical_form": final_text})
    return query_tree

In [14]:
# Build the query tree for the entities extracted in Listing 7.6.
parsed_query = generate_query_tree(query_entities)
display(parsed_query)

[{'semantic_function': 'popularity(query, position)',
  'popularity': 100,
  'id': '7',
  'surface_form': 'top',
  'type': 'semantic_function',
  'canonical_form': '{popular}'},
 {'type': 'keyword', 'surface_form': 'kimchi', 'canonical_form': 'kimchi'},
 {'semantic_function': 'location_distance(query, position)',
  'popularity': 90,
  'id': '1',
  'surface_form': 'near',
  'type': 'semantic_function',
  'canonical_form': '{location_distance}'},
 {'country': 'US',
  'admin_area': 'NC',
  'popularity': 827097,
  'id': 'a86e45ca-90e3-4eab-8b03-ddb73e943e04',
  'surface_form': 'Charlotte',
  'type': 'city',
  'location_coordinates': '35.22709,-80.84313',
  'canonical_form': 'Charlotte'}]

### Listing 7.9

In [15]:
print(inspect.getsource(sparse_semantic.popularity))

    def popularity(self, query, position):
        if len(query["query_tree"]) -1 > position:
            query["query_tree"][position] = {"type": "transformed",
                                             "syntax": "weaviate",
                                             "query": {"vector_search": {"popularity": [5]}}}
            return True
        return False



### Listing 7.10

In [16]:
print(inspect.getsource(sparse_semantic.location_distance))

    def location_distance(self, query, position):
        if len(query["query_tree"]) -1 > position:
            next_entity = query["query_tree"][position + 1]
            if next_entity["type"] == "city":
                query["query_tree"].pop(position + 1)
                query["query_tree"][position] = {
                    "type": "transformed",
                    "syntax": "weaviate",
                    "query": {"filters": self.create_geo_filter(next_entity['location_coordinates'],
                                                               "location_coordinates", 50)}}
                return True
        return False



### Listing 7.11


In [17]:
# %load -s process_semantic_functions semantic_search/query_tree.py
def process_semantic_functions(query_tree):
    """Execute every "semantic_function" node of the query tree, in order.

    A node of type "semantic_function" carries a Python call expression in
    node["semantic_function"] (for example 'popularity(query, position)').
    The expression is evaluated here; a falsy result re-labels the node as
    "invalid_semantic_function".

    Args:
        query_tree: list of node dicts; modified in place.

    Returns:
        The same `query_tree` list that was passed in.
    """
    position = 0
    # len() is re-evaluated on every pass because an evaluated function may
    # shrink the tree (e.g. by popping the node that follows it).
    while position < len(query_tree):
        node = query_tree[position]
        if node["type"] == "semantic_function":
            # The evaluated expression refers to the locals `query` and
            # `position` by name, so they must keep exactly these names.
            query = {"query_tree": query_tree} 
            # SECURITY NOTE(review): eval() executes whatever expression is
            # stored on the node; only safe while that data is trusted.
            command_successful = eval(node["semantic_function"])
            if not command_successful:
                node["type"] = "invalid_semantic_function"
        position += 1
    return query_tree 

### Listing 7.12

In [18]:
# %load -s get_enrichments semantic_search/query_tree.py
def get_enrichments(collection, keyword, limit=4):
    """Look up knowledge-graph enrichments for a single keyword.

    Traverses the semantic knowledge graph of `collection` from `keyword`
    (matched against the "content" field) to its related "content" terms and
    to its top "doc_type" value.

    Args:
        collection: collection handed to get_semantic_knowledge_graph().
        keyword: the term to enrich; also the key looked up in the result.
        limit: maximum number of related terms requested (default 4).

    Returns:
        A dict that may contain
          "category": the first key of the "doc_type" traversal values, and
          "term_vector": space-separated "term^relatedness" pairs with the
              relatedness rounded to 4 decimals.
        Empty when the keyword has no nested traversals.
    """
    start_node = {"field": "content",
                  "values": [keyword],
                  "default_operator": "OR"}
    child_nodes = [{"name": "related_terms",
                    "field": "content",
                    "limit": limit},
                   {"name": "doc_type",
                    "field": "doc_type",
                    "limit": 1}]
    graph = get_semantic_knowledge_graph(collection).traverse(start_node,
                                                              child_nodes)

    keyword_node = graph["graph"][0]["values"][keyword]
    if "traversals" not in keyword_node:
        return {}
    sub_traversals = keyword_node["traversals"]

    found = {}
    category_hits = [hit for hit in sub_traversals
                     if hit["name"] == "doc_type"]
    if category_hits:
        found["category"] = next(iter(category_hits[0]["values"]))

    term_hits = [hit for hit in sub_traversals
                 if hit["name"] == "related_terms"]
    if term_hits:
        weighted_terms = [f'{term}^{round(stats["relatedness"], 4)}'
                          for term, stats in term_hits[0]["values"].items()]
        found["term_vector"] = " ".join(weighted_terms).strip()

    return found

In [19]:
query = "kimchi"
# NOTE(review): the recorded output of this cell is an HTTP 404 from a Solr URL
# (localhost:8983) although the engine selected above is "weaviate" -- confirm
# the knowledge-graph backend is reachable before relying on this cell.
get_enrichments(reviews_collection, query)

HTTPError: 404 Client Error: Not Found for url: http://localhost:8983/solr/reviews/select

In [None]:
# Quoted entries such as '"korean bbq"' are passed to get_enrichments with
# their double quotes included.
other_queries = ["bbq", '"korean bbq"', "lasagna", "karaoke", '"drive through"']
for query in other_queries:
    enrichments = get_enrichments(reviews_collection, query)
    print(f"{query}: {enrichments}")

bbq: {'category': 'Barbeque', 'term_vector': 'bbq^0.9191 ribs^0.6187 pork^0.5992 brisket^0.5691'}
"korean bbq": {'category': 'Korean', 'term_vector': 'korean^0.7754 bbq^0.6716 banchan^0.5534 sariwon^0.5211'}
lasagna: {'category': 'Italian', 'term_vector': 'lasagna^0.9193 alfredo^0.3992 pasta^0.3909 italian^0.3742'}
karaoke: {'category': 'Karaoke', 'term_vector': 'karaoke^0.9193 sing^0.6423 songs^0.5256 song^0.4118'}
"drive through": {'category': 'Fast Food', 'term_vector': "drive^0.7428 through^0.6331 mcdonald's^0.2873 window^0.2643"}


### Listing 7.13

In [None]:
# %load -s enrich semantic_search/query_tree.py
def enrich(collection, query_tree):
    """Run semantic functions, then attach enrichments to keyword nodes.

    Args:
        collection: collection passed through to get_enrichments().
        query_tree: list of node dicts, handed to process_semantic_functions().

    Returns:
        The tree returned by process_semantic_functions(). Every "keyword"
        node for which get_enrichments() returned something is re-typed as
        "skg_enriched" and gains an "enrichments" entry; other nodes are
        left untouched.
    """
    processed_tree = process_semantic_functions(query_tree)
    for node in processed_tree:
        if node["type"] != "keyword":
            continue
        found = get_enrichments(collection, node["surface_form"])
        if not found:
            # Nothing known about this keyword: leave it as a plain keyword.
            continue
        node["type"] = "skg_enriched"
        node["enrichments"] = found
    return processed_tree

### Listing 7.14

In [None]:
from spladerunner import Expander
# NOTE(review): the meaning of the second argument (128) is not visible in
# this notebook -- confirm against the spladerunner documentation.
expander = Expander('Splade_PP_en_v1', 128)
queries = ["kimchi", "bbq", "korean bbq", 
           "lasagna", "karaoke", "drive through"]

# Print, for each query, the first element of what expand() returns.
for query in queries:
  sparse_vec = expander.expand(query,
                               outformat="lucene")[0]
  print(sparse_vec)     

{'kim': 3.11, '##chi': 3.04, 'ki': 1.52, ',': 0.92, 'who': 0.72, 'brand': 0.56, 'genre': 0.46, 'chi': 0.45, '##chy': 0.45, 'company': 0.41, 'only': 0.39, 'take': 0.31, 'club': 0.25, 'species': 0.22, 'color': 0.16, 'type': 0.15, 'but': 0.13, 'dish': 0.12, 'hotel': 0.11, 'music': 0.09, 'style': 0.08, 'name': 0.06, 'religion': 0.01}
{'bb': 2.78, 'grill': 1.85, 'barbecue': 1.36, 'dinner': 0.91, '##q': 0.78, 'dish': 0.77, 'restaurant': 0.65, 'sport': 0.46, 'food': 0.34, 'style': 0.34, 'eat': 0.24, 'a': 0.23, 'genre': 0.12, 'definition': 0.09}
{'korean': 2.84, 'korea': 2.56, 'bb': 2.23, 'grill': 1.58, 'dish': 1.21, 'restaurant': 1.18, 'barbecue': 0.79, 'kim': 0.67, 'food': 0.64, 'dinner': 0.39, 'restaurants': 0.32, 'japanese': 0.31, 'eat': 0.27, 'hotel': 0.16, 'famous': 0.11, 'brand': 0.11, '##q': 0.06, 'diner': 0.02}
{'las': 2.87, '##ag': 2.85, '##na': 2.39, ',': 0.84, 'she': 0.5, 'species': 0.34, 'hotel': 0.33, 'club': 0.31, 'location': 0.3, 'festival': 0.29, 'company': 0.27, 'her': 0.2, '

### Listing 7.15

In [None]:
print(inspect.getsource(sparse_semantic.transform_query))

    def transform_query(self, query_tree):
        for i, item in enumerate(query_tree):
            match item["type"]:
                case "transformed":
                    continue
                case "skg_enriched":
                    enrichments = item["enrichments"]  
                    if "term_vector" in enrichments:
                        query_string = enrichments["term_vector"]
                        transformed_query = query_string
                case _:
                    transformed_query = item["surface_form"].replace('"', '\\"')
            query_tree[i] = {"type": "transformed",
                             "syntax": "weaviate",
                             "query": transformed_query}                 
        return query_tree



In [None]:
def transform_query(enriched_query_tree):
    """Delegate query-tree transformation to the sparse semantic search helper.

    Args:
        enriched_query_tree: the query tree to transform.

    Returns:
        Whatever the helper's transform_query() returns for that tree.
    """
    sparse_search = get_sparse_semantic_search()
    return sparse_search.transform_query(enriched_query_tree)

In [None]:
# End-to-end pipeline: extract entities -> build query tree -> enrich -> transform.
query = "top kimchi near Charlotte"
tagger_data = extractor.extract_entities(query)
query_tree = generate_query_tree(tagger_data)
enriched_query_tree = enrich(reviews_collection, query_tree)
processed_query_tree = transform_query(enriched_query_tree)
display(processed_query_tree)

[{'type': 'transformed',
  'syntax': 'weaviate',
  'query': {'vector_search': {'popularity': [5]}}},
 {'type': 'transformed',
  'syntax': 'weaviate',
  'query': 'kimchi^0.9193 korean^0.7069 banchan^0.6593 bulgogi^0.5497'},
 {'type': 'transformed',
  'syntax': 'weaviate',
  'query': {'filter': [Argument(name='operator', value='WithinGeoRange'),
    Argument(name='valueGeoRange', value=[Argument(name='geoCoordinates', value=[Argument(name='latitude', value='35.22709'), Argument(name='longitude', value='-80.84313')]), Argument(name='distance', value=[Argument(name='max', value=50000)])]),
    Argument(name='path', value=['"location_coordinates"'])]}}]

### Listing 7.16

In [None]:
def to_queries(query_tree):
    """Collect the "query" entry of every node, preserving tree order.

    Args:
        query_tree: iterable of node dicts that each have a "query" key.

    Returns:
        A list with one "query" value per node.
    """
    collected = []
    for entry in query_tree:
        collected.append(entry["query"])
    return collected

# NOTE(review): this reads `query_tree`, not `processed_query_tree`; it
# presumably works only because the earlier steps modified that list in
# place -- confirm.
queries = to_queries(query_tree)
reviews_collection.search(query=queries, log=True,
                          return_fields=["business_name", "score", "state", "city"])

Search Request:
Get {
  Reviews(
    limit: 10
    bm25: {
      query: "kimchi korean banchan bulgogi"
    }
    where: {
      operator: And
      operands: [
        {
          operator: WithinGeoRange
          valueGeoRange: {
            geoCoordinates: {
              latitude: 35.22709
              longitude: -80.84313
            }
            distance: {
              max: 50000
            }
          }
          path: ["location_coordinates"]
        }
      ]
    }
  ) {
    business_name
    state
    city
    _additional {
      score
    }
  }
}
{'data': {'Get': {'Reviews': [{'_additional': {'score': '10.2406025'}, 'business_name': 'Avenue Market', 'city': 'Charlotte', 'state': 'NC'}, {'_additional': {'score': '3.2472682'}, 'business_name': 'Fujiyama', 'city': 'Charlotte', 'state': 'NC'}, {'_additional': {'score': '3.056887'}, 'business_name': 'Emzy Sushi Bar & Asian Kitchen', 'city': 'Charlotte', 'state': 'NC'}, {'_additional': {'score': '2.1345146'}, 'business_name'

{'docs': [{'business_name': 'Avenue Market',
   'city': 'Charlotte',
   'state': 'NC',
   'score': '10.2406025'},
  {'business_name': 'Fujiyama',
   'city': 'Charlotte',
   'state': 'NC',
   'score': '3.2472682'},
  {'business_name': 'Emzy Sushi Bar & Asian Kitchen',
   'city': 'Charlotte',
   'state': 'NC',
   'score': '3.056887'},
  {'business_name': 'Cantina 1511',
   'city': 'Charlotte',
   'state': 'NC',
   'score': '2.1345146'}]}

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 58736)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/

In [None]:
start_reviews_search_webserver()

Successfully Started Webserver (pid: 18802)!


In [None]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=good+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [None]:
#Cleanup so webserver doesn't keep running after you're done
stop_running_webservers()

Stopping webserver (pid: 18802)


## Success!
Up next: Chapter 8 - [Signals Boosting Models](../ch08/1.signals-boosting.ipynb)