# [ Chapter 6 - Using Content to Learn Domain-specific Language ]
# Query Classification and Disambiguation with Semantic Knowledge Graphs

NOTE: This notebook depends upon the Stack Exchange datasets. If you have any issues, please rerun the [Setting up the Stack Exchange Dataset](../ch05/2.index-datasets.ipynb) notebook.

In [9]:
import sys
sys.path.append('..')
from aips import *
import os
import json
from IPython.display import display,HTML
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start one named "AIPS".
spark = (
    SparkSession.builder
    .appName("AIPS")
    .getOrCreate()
)
# Search-engine handle used to look up collections below.
# NOTE(review): `get_engine` is presumably supplied by the `from aips import *` star-import — confirm.
engine = get_engine()

## Query Classification

In [10]:
# Handle to the "stackexchange" collection; the query classification and
# disambiguation functions in this notebook send their requests through it.
stackexchange_collection = engine.get_collection("stackexchange")

## Listing 6.1

In [11]:
def run_query_classification(query, classification_field="category", 
                             classification_limit=5, keywords_field="body", min_occurrences=5):
    """Print the classifications most related to ``query``.

    Sends a terms-facet request over ``classification_field`` to the
    ``stackexchange_collection`` and ranks each bucket by
    ``relatedness($fore,$back)``, where the foreground is ``query`` parsed
    with edismax against ``keywords_field`` and the background is ``*:*``.

    Args:
        query: Query string to classify.
        classification_field: Field whose values are the candidate classes.
        classification_limit: Maximum number of classes to print (facet ``limit``).
        keywords_field: Field the query is matched against (``qf``).
        min_occurrences: Facet ``mincount`` applied to the classification buckets.

    Returns:
        None. The query and one ``value  relatedness`` line per bucket are
        printed. If the response has no classification buckets (for example,
        the query matched nothing), only the headers are printed.
    """
    classification_query = {
        "params": {
            "qf": keywords_field,
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            # Only the facet section of the response is read below, so do not
            # ask for the matching documents themselves.
            "rows": 0
        },
        "query": query,
        "facet": {
            "classification": {
                "type": "terms",
                "field": classification_field,
                "sort": {"classification_relatedness": "desc"},
                "mincount": min_occurrences,
                "limit": classification_limit,
                "facet": {
                    "classification_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    }
                }
            }
        }
    }

    # assumes `search` returns the parsed JSON response as a dict — TODO confirm
    response = stackexchange_collection.search(classification_query)

    # When nothing matches, the "facets" section carries only a count and the
    # "classification" entry is absent; treat that as "no classifications"
    # instead of raising KeyError.
    buckets = response.get("facets", {}).get("classification", {}).get("buckets", [])

    print(f"Query: {query}") 
    print("  Classifications: ")
    for bucket in buckets:
        print(f"    {bucket['val']}  {bucket['classification_relatedness']['relatedness']}")
    print("\n")

# (query, number of top classifications to print) pairs, run in order.
classification_examples = [
    ("docker", 3),
    ("airplane", 1),
    ("airplane AND crash", 2),
    ("vitamins", 2),
    ("alien", 1),
    ("passport", 1),
    ("driver", 2),
    ("driver AND taxi", 2),
    ("driver AND install", 2),
]
for example_query, top_n in classification_examples:
    run_query_classification(example_query, classification_limit=top_n)

Query: docker
  Classifications: 
    devops  0.87978


Query: airplane
  Classifications: 
    travel  0.33334


Query: airplane AND crash
  Classifications: 
    scifi  0.02149
    travel  0.00475


Query: vitamins
  Classifications: 
    health  0.48681
    cooking  0.09441


Query: alien
  Classifications: 
    scifi  0.62541


Query: passport
  Classifications: 
    travel  0.82883


Query: driver
  Classifications: 
    travel  0.38996
    devops  0.08917


Query: driver AND taxi
  Classifications: 
    travel  0.24184
    scifi  -0.13757


Query: driver AND install
  Classifications: 
    devops  0.22277
    travel  -0.00675




## Disambiguation

## Listing 6.2

In [12]:
def run_disambiguation_query(query, context_field="category", context_limit=5,
                             keywords_field="body", keywords_limit=10, min_occurrences=5):
    """Print the contexts ``query`` appears in and the top keywords per context.

    Sends a two-level terms-facet request to the ``stackexchange_collection``:
    the outer facet buckets on ``context_field`` and the nested facet buckets
    on ``keywords_field``. Both levels are sorted by
    ``relatedness($fore,$back)``, where the foreground is ``query`` parsed with
    edismax against ``keywords_field`` and the background is ``*:*``.

    Args:
        query: Query string to disambiguate.
        context_field: Field whose values are the candidate contexts.
        context_limit: Maximum number of contexts to print (outer facet ``limit``).
        keywords_field: Field the query is matched against (``qf``) and the
            field the nested keyword facet buckets on.
        keywords_limit: Maximum number of keywords per context (nested facet ``limit``).
        min_occurrences: Facet ``mincount`` applied at both levels.

    Returns:
        None. Prints the query, then for each context its relatedness and one
        ``keyword  relatedness`` line per keyword. If the response has no
        context buckets (for example, the query matched nothing), only the
        query line is printed.
    """
    disambiguation_query = {
        "params": {
            "qf": keywords_field,
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            "rows": 0,
            "echoParams": "none",
            "omitHeader": "true"
        },
        "query": query,
        "facet": {
            "context": {
                "type": "terms",
                "field": context_field,
                "sort": {"context_relatedness": "desc"},
                "mincount": min_occurrences,
                "limit": context_limit,
                "facet": {
                    "context_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    },
                    "keywords": {
                        "type": "terms",
                        "field": keywords_field,
                        "mincount": min_occurrences,
                        "limit": keywords_limit,
                        "sort": {"keywords_relatedness": "desc"},
                        "facet": {
                            "keywords_relatedness": {
                                "type": "func",
                                "func": "relatedness($fore,$back)"
                            }
                        }
                    }
                }
            }
        }
    }

    # assumes `search` returns the parsed JSON response as a dict — TODO confirm
    response = stackexchange_collection.search(disambiguation_query)

    # When nothing matches, the "facets" section carries only a count and the
    # "context" entry is absent; treat that as "no contexts" instead of
    # raising KeyError.
    context_buckets = response.get("facets", {}).get("context", {}).get("buckets", [])

    print(f"Query: {query}") 
    for ctx_bucket in context_buckets:
        print(f"  Context: {ctx_bucket['val']}  {ctx_bucket['context_relatedness']['relatedness']}")
        print("    Keywords: ")
        # A context bucket without a nested "keywords" entry prints no keyword lines.
        for kw_bucket in ctx_bucket.get("keywords", {}).get("buckets", []):
            print(f"      {kw_bucket['val']}  {kw_bucket['keywords_relatedness']['relatedness']}")
        print("\n")

## Listing 6.3

In [13]:
# (query, keyword-argument overrides) pairs, run in order; an empty dict
# means the function's defaults are used.
disambiguation_examples = [
    ("server", {}),
    ("driver", {"context_limit": 2}),
    ("chef", {"context_limit": 2}),
]
for example_query, overrides in disambiguation_examples:
    run_disambiguation_query(example_query, **overrides)

Query: server
  Context: devops  0.83796
    Keywords: 
      server  0.93698
      servers  0.76818
      docker  0.75955
      code  0.72832
      configuration  0.70686
      deploy  0.70634
      nginx  0.70366
      jenkins  0.69934
      git  0.68932
      ssh  0.6836


  Context: cooking  -0.1574
    Keywords: 
      server  0.66363
      restaurant  0.16482
      pie  0.12882
      served  0.12098
      restaurants  0.11679
      knife  0.10788
      pieces  0.10135
      serve  0.08934
      staff  0.0886
      dish  0.08553


  Context: travel  -0.15959
    Keywords: 
      server  0.81226
      tipping  0.54391
      vpn  0.45352
      tip  0.41117
      servers  0.39053
      firewall  0.33092
      restaurant  0.21698
      tips  0.19524
      bill  0.18951
      cash  0.18485


  Context: scifi  -0.28208
    Keywords: 
      server  0.78173
      flynn's  0.53341
      computer  0.28075
      computers  0.2593
      flynn  0.24963
      servers  0.24778
      grid  0.2388

## Bonus Examples: Job Skills (not included in chapter)

In [14]:
# Handle to the "jobs" collection; `print_related_job_terms` sends its requests through it.
jobs_collection=engine.get_collection("jobs")

In [15]:
def print_related_job_terms(query):
    """Print the ``job_description`` terms most related to ``query``.

    ``query`` is sent to the ``jobs_collection`` as a quoted phrase, matched
    with edismax against the ``job_description`` and ``job_title`` fields.
    Terms of ``job_description`` are bucketed and sorted by
    ``relatedness($fore,$back)`` (foreground: the phrase query; background:
    ``*:*``) with ``min_popularity`` set to 0.0005.

    Args:
        query: Phrase to find related terms for. Double quotes and backslashes
            inside it are escaped so they stay part of the phrase.

    Returns:
        None. Prints the query as a header followed by one
        ``term  relatedness`` line per bucket. If the response has no buckets
        (for example, the phrase matched nothing), only the header is printed.
    """
    # The query is wrapped in double quotes below; escape backslashes first and
    # then embedded quotes so a quote in the input cannot end the phrase early.
    escaped_query = query.replace("\\", "\\\\").replace("\"", "\\\"")

    request = {
        "params": {
            "qf": "job_description job_title",
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            "rows": 0,
            "echoParams": "none",
            "omitHeader": "true"
        },
        "query": f"\"{escaped_query}\"",
        "facet": {
            "job_description_keywords": {
                "type": "terms",
                "field": "job_description",
                "sort": {"relatedness": "desc"},
                "facet": {
                    "relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)",
                        "min_popularity": 0.0005
                    }
                }
            }
        }
    }

    # assumes `search` returns the parsed JSON response as a dict — TODO confirm
    response = jobs_collection.search(request)

    # When nothing matches, the "facets" section carries only a count and the
    # "job_description_keywords" entry is absent; treat that as "no related
    # terms" instead of raising KeyError.
    buckets = response.get("facets", {}).get("job_description_keywords", {}).get("buckets", [])

    print(f"{query}\n------------")
    for bucket in buckets:
        print(f"{bucket['val']}  {bucket['relatedness']['relatedness']}")
    print("\n")

In [16]:
# Example skills and job titles, run in order.
for job_phrase in ("spark", "registered nurse", "docker", "personal trainer"):
    print_related_job_terms(job_phrase)

spark
------------
spark  0.80665
hadoop  0.59424
hive  0.52983
kafka  0.51552
impala  0.45309
streamsets  0.39341
scala  0.38564
flume  0.38401
mapreduce  0.36195
nosql  0.35116


registered nurse
------------
nurse  0.64418
rn  0.59936
registered  0.42736
nursing  0.33399
clinical  0.23799
hospital  0.20062
families  0.14216
patients  0.13847
practice  0.13157
care  0.12564


docker
------------
docker  0.80683
kubernetes  0.69719
jenkins  0.49458
container  0.49226
openshift  0.45521
aws  0.41473
microservices  0.39713
cd  0.39409
ci  0.38633
devops  0.38085


personal trainer
------------
nsca  0.74439
nasm  0.71632
acsm  0.66348
nfpt  0.66004
trainer  0.63695
issa  0.62555
aerobic  0.62328
cpr  0.53834
ace  0.50328
distant  0.46272




## Success!

You've leveraged a semantic knowledge graph to find related terms for a query, performed query expansion based upon semantically-similar terms, explored multiple different ways to impact precision and recall of queries through integrating semantically-augmented queries, generated content-based recommendations leveraging a semantic knowledge graph, explored arbitrary relationship types by traversing a semantic knowledge graph, and performed both query classification and query disambiguation using a semantic knowledge graph.

Semantic knowledge graphs can be a powerful tool for understanding user intent and interpreting both queries and content based upon meaning instead of just text keywords.

Up next: [Related Keyword Detection from Signals](../ch06/2.related-keywords-from-signals.ipynb)