# [ Chapter 6 - Using Content to Learn Domain-specific Language ]
# Query Classification and Disambiguation with Semantic Knowledge Graphs

In [1]:
import json
from aips import get_engine
from aips.spark import get_spark_session
from aips import get_semantic_knowledge_graph as get_skg
import aips.indexer

# Obtain the Spark session and the search-engine handle used by this notebook.
spark = get_spark_session()
engine = get_engine()
# (Re)build the "stackexchange" collection; the cell output shows the existing
# collection being wiped, recreated, and loaded from CSV data.
aips.indexer.build_collection(engine, "stackexchange")

Wiping "stackexchange" collection
Creating "stackexchange" collection
Status: Success
Loading data/health/posts.csv
Schema: 
root
 |-- post_type_id: integer (nullable = true)
 |-- accepted_answer_id: integer (nullable = true)
 |-- parent_id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- deletion_date: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- view_count: integer (nullable = true)
 |-- body: string (nullable = true)
 |-- owner_user_id: integer (nullable = true)
 |-- owner_display_name: string (nullable = true)
 |-- last_editor_user_id: integer (nullable = true)
 |-- last_editor_display_name: string (nullable = true)
 |-- last_edit_date: timestamp (nullable = true)
 |-- last_activity_date: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- answer_count: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- favorite_count: integer (nullable = true)
 |-- 

<engines.solr.SolrCollection.SolrCollection at 0x7fd4d54409a0>

## Query Classification

## Listing 6.1

In [2]:
def print_classifications(query, traversal):
    """Print the classifications found for ``query`` in a traversal result.

    Args:
        query: The query string; also used as the key under which its node
            is stored in ``traversal["graph"][0]["values"]``.
        traversal: Nested dict response; the classifications are read from
            the first nested traversal beneath the query node.

    Each classification is printed as ``<term>  <relatedness>``, followed by
    a trailing blank line.
    """
    query_node = traversal["graph"][0]["values"][query]
    category_nodes = query_node["traversals"][0]["values"]

    print(f"Query: {query}")
    print("  Classifications:")
    for label, node in category_nodes.items():
        score = node["relatedness"]
        print(f'    {label}  {score}')
    print()

In [3]:
def print_query_classification(query, classification_field="category",
                               classification_limit=5, keywords_field="body",
                               min_occurrences=5):
    """Traverse from ``query`` to a classification field and print the result.

    Builds a two-node traversal (query node, then classification node), runs
    it through the module-level ``skg`` object, and hands the response to
    ``print_classifications``.

    Args:
        query: Query string placed in the first node's ``values`` list.
        classification_field: Field name for the second node.
        classification_limit: Passed as the second node's ``limit``.
        keywords_field: Field name for the first (query) node.
        min_occurrences: Passed as the second node's ``min_occurrences``.
    """
    query_node = {"field": keywords_field, "values": [query]}
    category_node = {
        "field": classification_field,
        "min_occurrences": min_occurrences,
        "limit": classification_limit,
    }

    # `skg` is resolved as a global at call time, so it must be bound before
    # this function is invoked.
    result = skg.traverse(query_node, category_node)
    print_classifications(query, result)

# Bind the module-level `skg` that print_query_classification reads as a
# global; it wraps the "stackexchange" collection.
skg = get_skg(engine.get_collection("stackexchange"))

# Classify a mix of single-term and boolean (AND) queries; classification_limit
# is forwarded as the `limit` of the category node for each query.
print_query_classification("docker", classification_limit=3)
print_query_classification("airplane", classification_limit=1)
print_query_classification("airplane AND crash", classification_limit=2)
print_query_classification("vitamins", classification_limit=2)
print_query_classification("alien", classification_limit=1)
print_query_classification("passport", classification_limit=1)
print_query_classification("driver", classification_limit=2)
print_query_classification("driver AND taxi", classification_limit=2)
print_query_classification("driver AND install", classification_limit=2)

Query: docker
  Classifications:

Query: airplane
  Classifications:
    travel  0.32742

Query: airplane AND crash
  Classifications:
    scifi  0.02024
    travel  0.00384

Query: vitamins
  Classifications:
    health  0.48265
    cooking  0.09135

Query: alien
  Classifications:
    scifi  0.61575

Query: passport
  Classifications:
    travel  0.82611

Query: driver
  Classifications:
    travel  0.40174
    health  -0.05868

Query: driver AND taxi
  Classifications:
    travel  0.23735
    scifi  -0.14073

Query: driver AND install
  Classifications:
    travel  0.03986



## Disambiguation

## Listing 6.2

In [4]:
def print_disambigutaions(query, traversal):
    """Print every context found for ``query`` along with its keywords.

    Args:
        query: The query string; also the key of its node under
            ``traversal["graph"][0]["values"]``.
        traversal: Nested dict response. Contexts are read from the first
            nested traversal beneath the query node, and each context's
            keywords from the first nested traversal beneath that context.

    For each context a header line with its relatedness is printed, then one
    line per keyword with its relatedness, then a blank line.
    """
    query_node = traversal["graph"][0]["values"][query]
    context_nodes = query_node["traversals"][0]["values"]

    print(f"Query: {query}")
    for context_name, context_node in context_nodes.items():
        print(f'  Context: {context_name}  {context_node["relatedness"]}')
        print("    Keywords: ")
        keyword_nodes = context_node["traversals"][0]["values"]
        for term, term_node in keyword_nodes.items():
            print(f'      {term}  {term_node["relatedness"]}')
        print()

def print_query_disambigutaion(query,
                               context_field="category", context_limit=5,
                               keywords_field="body", keywords_limit=10,
                               min_occurrences=5):
    """Traverse query -> contexts -> keywords and print the result.

    Builds a three-node traversal, runs it through the module-level ``skg``
    object, and hands the response to ``print_disambigutaions``.

    Args:
        query: Query string placed in the first node's ``values`` list.
        context_field: Field name for the second (context) node.
        context_limit: Passed as the context node's ``limit``.
        keywords_field: Field name for both the first node and the third
            (keywords) node.
        keywords_limit: Passed as the keywords node's ``limit``.
        min_occurrences: Passed as ``min_occurrences`` on the context and
            keywords nodes.
    """
    start_node = {"field": keywords_field, "values": [query]}
    context_node = {
        "field": context_field,
        "min_occurrences": min_occurrences,
        "limit": context_limit,
    }
    keyword_node = {
        "field": keywords_field,
        "min_occurrences": min_occurrences,
        "limit": keywords_limit,
    }

    # `skg` is resolved as a global at call time.
    result = skg.traverse(start_node, context_node, keyword_node)
    print_disambigutaions(query, result)

## Listing 6.3

In [5]:
# Disambiguate three ambiguous terms; context_limit is forwarded as the
# `limit` of the context node (the default of 5 is used for "server").
print_query_disambigutaion("server")
print_query_disambigutaion("driver", context_limit=2)
print_query_disambigutaion("chef", context_limit=2)

Query: server
  Context: travel  0.06808
    Keywords: 
      server  0.9178
      servers  0.63006
      tipping  0.53918
      vpn  0.51253
      firewall  0.44784
      tip  0.4086
      ip  0.22201
      encrypted  0.21787
      restaurant  0.21428
      proxy  0.20573

  Context: cooking  -0.02579
    Keywords: 
      server  0.85633
      restaurant  0.16297
      pie  0.12743
      served  0.12008
      restaurants  0.11547
      knife  0.10688
      pieces  0.10067
      serve  0.08914
      staff  0.08776
      dish  0.0845

  Context: scifi  -0.03825
    Keywords: 
      server  0.90455
      flynn's  0.52892
      servers  0.44906
      networking  0.3657
      computer  0.28046
      computers  0.25857
      flynn  0.24637
      shutdown  0.24159
      grid  0.23718
      hacker  0.19523

Query: driver
  Context: travel  0.38996
    Keywords: 
      driver  0.93417
      drivers  0.76932
      taxi  0.71977
      car  0.65572
      license  0.61319
      driving  0.60849
  

## Listing 6.4
#### An SKG disambiguation request for the query `chef`

In [6]:
def print_disambigutaion_request(query, context_field="category", context_limit=5,
                                 keywords_field="body", keywords_limit=10,
                                 min_occurrences=5):
    """Print the request generated for a disambiguation traversal of ``query``.

    Builds the same three-node traversal shape (query, contexts, keywords),
    passes it to ``skg.transform_request`` and prints the returned value as
    JSON indented by two spaces. Only ``transform_request`` is called here;
    ``skg.traverse`` is not.

    Args:
        query: Query string placed in the first node's ``values`` list.
        context_field: Field name for the second (context) node.
        context_limit: Passed as the context node's ``limit``.
        keywords_field: Field name for the first and third nodes.
        keywords_limit: Passed as the keywords node's ``limit``.
        min_occurrences: Passed as ``min_occurrences`` on the context and
            keywords nodes.
    """
    start_node = {"field": keywords_field, "values": [query]}
    context_node = {
        "field": context_field,
        "min_occurrences": min_occurrences,
        "limit": context_limit,
    }
    keyword_node = {
        "field": keywords_field,
        "min_occurrences": min_occurrences,
        "limit": keywords_limit,
    }

    # `skg` is resolved as a global at call time.
    request = skg.transform_request(start_node, context_node, keyword_node)
    print(json.dumps(request, indent=2))

In [7]:
# Show the generated request for "chef" with the context node's limit set to 2.
print_disambigutaion_request("chef", context_limit=2)

{
  "limit": 0,
  "params": {
    "q": "*:*",
    "fore": "{!${defType} v=$q}",
    "back": "*:*",
    "defType": "edismax",
    "f0_0_query": "chef"
  },
  "facet": {
    "f0_0": {
      "type": "query",
      "sort": {
        "relatedness": "desc"
      },
      "facet": {
        "relatedness": {
          "type": "func",
          "func": "relatedness($fore,$back)"
        },
        "f1_0": {
          "type": "terms",
          "limit": 2,
          "sort": {
            "relatedness": "desc"
          },
          "facet": {
            "relatedness": {
              "type": "func",
              "func": "relatedness($fore,$back)"
            },
            "f2_0": {
              "type": "terms",
              "limit": 10,
              "sort": {
                "relatedness": "desc"
              },
              "facet": {
                "relatedness": {
                  "type": "func",
                  "func": "relatedness($fore,$back)"
                }
              },

## Success!

You've leveraged a semantic knowledge graph to find related terms for a query, performed query expansion based upon semantically-similar terms, explored multiple different ways to impact precision and recall of queries through integrating semantically-augmented queries, generated content-based recommendations leveraging a semantic knowledge graph, explored arbitrary relationship types by traversing a semantic knowledge graph, and performed both query classification and query disambiguation using a semantic knowledge graph.

Semantic knowledge graphs can be a powerful tool for understanding user intent and interpreting both queries and content based upon meaning instead of just text keywords.

Up next: [Related Keyword Detection from Signals](../ch06/2.related-keywords-from-signals.ipynb)