# [ Chapter 6 - Using Content to Learn Domain-specific Language ]
# Query Classification and Disambiguation with Semantic Knowledge Graphs

NOTE: This notebook depends upon the Stack Exchange datasets. If you have any issues, please rerun the [Setting up the Stack Exchange Dataset](../ch05/2.index-datasets.ipynb) notebook.

In [1]:
import sys
sys.path.append('..')
from aips import *
import os
import json
from IPython.display import display,HTML
from pyspark.sql import SparkSession
# Create (or reuse, if one is already running) the Spark session named "AIPS".
spark = SparkSession.builder.appName("AIPS").getOrCreate()
# Search-engine client; every query cell below goes through engine.search(...).
# NOTE(review): get_engine presumably comes from the `aips` star import - confirm.
engine = get_engine()

## Query Classification

In [2]:
# Collection searched by run_query_classification and run_disambiguation_query;
# both read this module-level name at call time rather than taking it as a parameter.
collection = "stackexchange"

## Listing 6.1

In [3]:
def run_query_classification(query, classification_field="category", 
                             classification_limit=5, keywords_field="body", min_occurrences=5):
    """Print the classifications most related to `query`.

    Builds a zero-row request whose only payload is a terms facet on
    `classification_field`. Each facet bucket is scored with
    `relatedness($fore,$back)`, where the foreground is the edismax query over
    `keywords_field` and the background is every document (`*:*`), and the
    buckets are sorted by that score in descending order.

    The request is sent through the module-level `engine` to the module-level
    `collection`; both are looked up when the function is called.

    Args:
        query: Query string to classify.
        classification_field: Field whose values are the candidate classifications.
        classification_limit: Maximum number of classifications to print.
        keywords_field: Field the query is matched against (`qf`).
        min_occurrences: Facet `mincount` applied to the classification buckets.

    Returns:
        None. The query and one line per classification (value and relatedness
        score) are printed to stdout.
    """
    classification_query = {
        "params": {
            "qf": keywords_field,
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            "rows": 0,
            "echoParams": "none",
            "omitHeader": "true"
        },
        "query": query,
        "facet": {
            "classification": {
                "type": "terms",
                "field": classification_field,
                "sort": {"classification_relatedness": "desc"},
                "mincount": min_occurrences,
                "limit": classification_limit,
                "facet": {
                    "classification_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    }
                }
            }
        }
    }

    search_results = engine.search(collection, classification_query).json()

    # The "classification" facet can be missing from the response (e.g. when
    # the query matches no documents), so fall back to an empty bucket list
    # instead of raising KeyError.
    buckets = (search_results.get("facets", {})
                             .get("classification", {})
                             .get("buckets", []))

    print(f"Query: {query}")
    print("  Classifications: ")
    for bucket in buckets:
        print(f"    {bucket['val']}  {bucket['classification_relatedness']['relatedness']}")
    print("\n")

# Classify a mix of unambiguous and ambiguous queries.
# Each entry is (query, number of top classifications to print).
classification_examples = [
    ("docker", 3),
    ("airplane", 1),
    ("airplane AND crash", 2),
    ("camping", 2),
    ("alien", 1),
    ("passport", 1),
    ("driver", 2),
    ("driver AND taxi", 2),
    ("driver AND install", 2),
]

for example_query, top_n in classification_examples:
    run_query_classification(example_query, classification_limit=top_n)

Query: docker
  Classifications: 
    devops  0.88257


Query: airplane
  Classifications: 
    travel  0.33525


Query: airplane AND crash
  Classifications: 
    scifi  0.02399
    travel  0.0066


Query: camping
  Classifications: 
    outdoors  0.71621
    travel  0.01494


Query: alien
  Classifications: 
    scifi  0.64359


Query: passport
  Classifications: 
    travel  0.83413


Query: driver
  Classifications: 
    travel  0.3975
    devops  0.09238


Query: driver AND taxi
  Classifications: 
    travel  0.25059
    scifi  -0.13167


Query: driver AND install
  Classifications: 
    devops  0.22399
    travel  -0.00634




## Disambiguation

## Listing 6.2

In [4]:
def run_disambiguation_query(query, context_field="category", context_limit=5,
                             keywords_field="body", keywords_limit=10, min_occurrences=5):
    """Print, per context, the keywords most related to `query`.

    Builds a zero-row request with a two-level facet: an outer terms facet on
    `context_field` and, nested inside each context bucket, a terms facet on
    `keywords_field`. Both levels are scored with `relatedness($fore,$back)`
    (foreground: the edismax query over `keywords_field`; background: `*:*`)
    and sorted by that score in descending order.

    The request is sent through the module-level `engine` to the module-level
    `collection`; both are looked up when the function is called.

    Args:
        query: Query string to disambiguate.
        context_field: Field whose values are the candidate contexts.
        context_limit: Maximum number of contexts to print.
        keywords_field: Field the query is matched against (`qf`) and from
            which the related keywords are taken.
        keywords_limit: Maximum number of keywords printed per context.
        min_occurrences: Facet `mincount` applied at both facet levels.

    Returns:
        None. The query, each context with its relatedness score, and each
        context's keywords with their scores are printed to stdout.
    """
    disambiguation_query = {
        "params": {
            "qf": keywords_field,
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            "rows": 0,
            "echoParams": "none",
            "omitHeader": "true"
        },
        "query": query,
        "facet": {
            "context": {
                "type": "terms",
                "field": context_field,
                "sort": {"context_relatedness": "desc"},
                "mincount": min_occurrences,
                "limit": context_limit,
                "facet": {
                    "context_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    },
                    "keywords": {
                        "type": "terms",
                        "field": keywords_field,
                        "mincount": min_occurrences,
                        "limit": keywords_limit,
                        "sort": {"keywords_relatedness": "desc"},
                        "facet": {
                            "keywords_relatedness": {
                                "type": "func",
                                "func": "relatedness($fore,$back)"
                            }
                        }
                    }
                }
            }
        }
    }

    search_results = engine.search(collection, disambiguation_query).json()

    # The "context" facet can be missing from the response (e.g. when the
    # query matches no documents), so fall back to an empty bucket list
    # instead of raising KeyError.
    context_buckets = (search_results.get("facets", {})
                                     .get("context", {})
                                     .get("buckets", []))

    print(f"Query: {query}")
    for ctx_bucket in context_buckets:
        print(f"  Context: {ctx_bucket['val']}  {ctx_bucket['context_relatedness']['relatedness']}")
        print("    Keywords: ")
        # Same defensive lookup for the nested keywords facet.
        for kw_bucket in ctx_bucket.get("keywords", {}).get("buckets", []):
            print(f"      {kw_bucket['val']}  {kw_bucket['keywords_relatedness']['relatedness']}")
        print("\n")

## Listing 6.3

In [5]:
# Disambiguate three ambiguous terms. Each entry is (query, keyword overrides);
# an empty dict means the function's default context_limit is used.
disambiguation_examples = [
    ("server", {}),
    ("driver", {"context_limit": 2}),
    ("chef", {"context_limit": 2}),
]

for example_query, overrides in disambiguation_examples:
    run_disambiguation_query(example_query, **overrides)

Query: server
  Context: devops  0.84145
    Keywords: 
      server  0.93833
      servers  0.77346
      docker  0.76534
      code  0.73233
      deploy  0.71189
      nginx  0.71097
      configuration  0.70761
      jenkins  0.70575
      git  0.69694
      ssh  0.69157


  Context: outdoors  -0.07938
    Keywords: 
      server  0.35588
      can  0.03435
      you  0.03071
      with  0.0268
      it  0.0203
      have  0.01979
      of  0.01463
      in  0.01434
      and  0.013
      to  0.01132


  Context: travel  -0.15243
    Keywords: 
      server  0.81622
      tipping  0.53867
      vpn  0.46088
      tip  0.4014
      servers  0.39731
      firewall  0.33683
      restaurant  0.22171
      bill  0.19194
      cash  0.18885
      tips  0.18553


  Context: cooking  -0.15249
    Keywords: 
      server  0.67144
      restaurant  0.16802
      pie  0.13122
      served  0.12279
      restaurants  0.1191
      knife  0.10357
      pieces  0.1005
      serve  0.09023
      

## Bonus Examples (not included in chapter)

In [6]:
# Bonus: job-description terms most related to the phrase "spark".
#
# A dedicated name is used for the jobs collection instead of reassigning the
# global `collection`: run_query_classification and run_disambiguation_query
# read `collection` at call time, so overwriting it here would make those
# cells query the wrong collection if they are re-run afterwards.
jobs_collection = "jobs"

request = {
    "params": {
        "qf": "job_description job_title",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": "\"spark\"",
    "facet": {
        "job_description_keywords": {
            "type": "terms",
            "field": "job_description",
            "sort": {"relatedness": "desc"},
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)"
                }
            }
        }
    }
}

search_results = engine.search(jobs_collection, request).json()

# The facet can be missing from the response (e.g. no matching documents);
# treat that as "no related terms" rather than raising KeyError.
keyword_buckets = (search_results.get("facets", {})
                                 .get("job_description_keywords", {})
                                 .get("buckets", []))

for bucket in keyword_buckets:
    print(f'{bucket["val"]}  {bucket["relatedness"]["relatedness"]}')

spark  0.80665
hadoop  0.59424
hive  0.52983
kafka  0.51552
impala  0.45309
streamsets  0.39341
scala  0.38564
flume  0.38401
attunity  0.37374
mapreduce  0.36195


In [7]:
# Bonus: job-description terms most related to the phrase "chef".
#
# A dedicated name is used for the jobs collection instead of reassigning the
# global `collection`: run_query_classification and run_disambiguation_query
# read `collection` at call time, so overwriting it here would make those
# cells query the wrong collection if they are re-run afterwards.
jobs_collection = "jobs"

request = {
    "params": {
        "qf": "job_description job_title",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": "\"chef\"",
    "facet": {
        "job_description_keywords": {
            "type": "terms",
            "field": "job_description",
            "sort": {"relatedness": "desc"},
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)",
                    # NOTE(review): relatedness() option that, per the Solr docs,
                    # sets a minimum term popularity for scoring - confirm exact semantics.
                    "min_popularity": 0.0005
                }
            }
        }
    }
}

search_results = engine.search(jobs_collection, request).json()

# The facet can be missing from the response (e.g. no matching documents);
# treat that as "no related terms" rather than raising KeyError.
keyword_buckets = (search_results.get("facets", {})
                                 .get("job_description_keywords", {})
                                 .get("buckets", []))

for bucket in keyword_buckets:
    print(f'{bucket["val"]}  {bucket["relatedness"]["relatedness"]}')

chef  0.80689
puppet  0.59501
ansible  0.52824
terraform  0.3866
jenkins  0.30455
culinary  0.25935
docker  0.25145
cd  0.2434
ci  0.23938
ruby  0.20856


## Success!

You've leveraged a semantic knowledge graph to find related terms for a query, performed query expansion based upon semantically-similar terms, explored multiple ways to impact the precision and recall of queries by integrating semantically-augmented queries, generated content-based recommendations leveraging a semantic knowledge graph, explored arbitrary relationship types by traversing a semantic knowledge graph, and performed both query classification and query disambiguation using a semantic knowledge graph.

Semantic knowledge graphs can be a powerful tool for understanding user intent and interpreting both queries and content based upon meaning instead of just text keywords.

Up next: [Related Keyword Detection from Signals](../ch06/2.related-keywords-from-signals.ipynb)