# Working with Semantic Knowledge Graphs

In [None]:
import sys
# Put the parent directory on the import path so the `aips` package can be found.
sys.path.append('..')
# NOTE(review): `requests` and `SOLR_URL`, used by later cells, are not imported
# explicitly anywhere in this notebook - presumably they come from this star import.
from aips import *
import os
import json
# `display`/`HTML` are imported from the public `IPython.display` module; the
# previous `IPython.core.display` path is a deprecated internal location.
from IPython.display import display, HTML
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("aips-ch5").getOrCreate()

## Query Classification

In [2]:
# Name of the Solr collection to query. The helper functions defined below read
# this module-level variable when they build the request URL.
collection = "stackexchange"

## Listing 6.1

In [3]:
def run_query_classification(query,keywords_field="body",classification_field="category",classification_limit=5,min_occurrences=5):
    """Print the values of `classification_field` most related to `query`.

    Sends a JSON facet request to the `/select` handler of the Solr collection
    named by the module-level `collection` variable. The foreground set is
    `query` parsed with edismax against `keywords_field`; the background set is
    every document (`*:*`). Buckets of `classification_field` are sorted by
    `relatedness($fore,$back)` in descending order.

    Args:
        query: Query string to classify.
        keywords_field: Field the query is matched against (`qf`).
        classification_field: Field whose terms are the candidate classes.
        classification_limit: Maximum number of buckets requested.
        min_occurrences: Passed as the terms facet's `mincount`.

    Returns:
        None. The query and one line per bucket (value and relatedness score)
        are printed to stdout.

    Raises:
        requests.HTTPError: If Solr answers with an HTTP error status.
    """
    classification_query = {
        "params": {
            "qf": keywords_field,
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            "rows": 0,  # only the facet section is needed, not the documents
            "echoParams": "none",
            "omitHeader": "true"
        },
        "query": query,
        "facet": {
            "classification":{
                "type": "terms",
                "field": classification_field,
                "sort": { "classification_relatedness": "desc"},
                "mincount": min_occurrences,
                "limit": classification_limit,
                "facet": {
                    "classification_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    }
                }
            }
        }
    }

    response = requests.post(f"{SOLR_URL}/{collection}/select", json=classification_query)
    # Surface HTTP failures directly instead of as a confusing KeyError on the payload.
    response.raise_for_status()
    search_results = response.json()

    # The facet entry can be absent from the response (e.g. when the query
    # matches no documents); treat that as "no classifications".
    classification_buckets = (
        search_results.get("facets", {})
        .get("classification", {})
        .get("buckets", [])
    )

    print("Query: " + query)
    print("  Classifications: ")
    for classification_bucket in classification_buckets:
        print("    " + str(classification_bucket["val"]) + "  " + str(classification_bucket["classification_relatedness"]["relatedness"]))
    print("\n")

# Example queries paired with the number of classifications to request for each.
classification_examples = (
    ("docker", 3),
    ("airplane", 1),
    ("airplane AND crash", 2),
    ("camping", 2),
    ("alien", 1),
    ("passport", 1),
    ("driver", 2),
    ("driver AND taxi", 2),
    ("driver AND install", 2),
)

# Classify every example against the "category" field, in the order listed.
for example_query, example_limit in classification_examples:
    run_query_classification(
        query=example_query,
        classification_field="category",
        classification_limit=example_limit,
    )


Query: docker
  Classifications: 


Query: airplane
  Classifications: 
    devops  0.20632


Query: airplane AND crash
  Classifications: 
    scifi  0.01927
    devops  0.00426


Query: camping
  Classifications: 
    outdoors  0.70998
    devops  0.00609


Query: alien
  Classifications: 
    scifi  0.69532


Query: passport
  Classifications: 
    devops  0.72612


Query: driver
  Classifications: 
    devops  0.27655
    travel  0.27655


Query: driver AND taxi
  Classifications: 
    devops  0.15285
    travel  0.15285


Query: driver AND install
  Classifications: 
    devops  0.02295
    travel  0.02295




## Disambiguation

## Listing 6.2

In [4]:
def run_disambiguation_query(query,keywords_field="body",context_field="category",keywords_limit=10,context_limit=5,min_occurrences=5):
    """Print, per context, the keywords most related to `query`.

    Sends a two-level JSON facet request to the `/select` handler of the Solr
    collection named by the module-level `collection` variable. The outer terms
    facet buckets documents by `context_field`; inside each context bucket a
    nested terms facet over `keywords_field` lists keywords. Both levels are
    sorted by `relatedness($fore,$back)` in descending order, where the
    foreground is `query` parsed with edismax against `keywords_field` and the
    background is every document (`*:*`).

    Args:
        query: Query string to disambiguate.
        keywords_field: Field the query is matched against and from which
            keywords are faceted.
        context_field: Field whose terms define the candidate contexts.
        keywords_limit: Maximum number of keyword buckets per context.
        context_limit: Maximum number of context buckets.
        min_occurrences: Passed as `mincount` to both terms facets.

    Returns:
        None. The query, each context with its relatedness score, and that
        context's keywords with their scores are printed to stdout.

    Raises:
        requests.HTTPError: If Solr answers with an HTTP error status.
    """
    disambiguation_query = {
        "params": {
            "qf": keywords_field,
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            "rows": 0,  # only the facet section is needed, not the documents
            "echoParams": "none",
            "omitHeader": "true"
        },
        "query": query,
        "facet": {
            "context":{
                "type": "terms",
                "field": context_field,
                "sort": { "context_relatedness": "desc"},
                "mincount": min_occurrences,
                "limit": context_limit,
                "facet": {
                    "context_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    },
                    "keywords": {
                        "type": "terms",
                        "field": keywords_field,
                        "mincount": min_occurrences,
                        "limit": keywords_limit,
                        "sort": { "keywords_relatedness": "desc"},
                        "facet": {
                            "keywords_relatedness": {
                                "type": "func",
                                "func": "relatedness($fore,$back)"
                            }
                        }
                    }
                }
            }
        }
    }

    response = requests.post(f"{SOLR_URL}/{collection}/select", json=disambiguation_query)
    # Surface HTTP failures directly instead of as a confusing KeyError on the payload.
    response.raise_for_status()
    search_results = response.json()

    # The facet entry can be absent from the response (e.g. when the query
    # matches no documents); treat that as "no contexts".
    context_buckets = (
        search_results.get("facets", {})
        .get("context", {})
        .get("buckets", [])
    )

    print("Query: " + query)
    for context_bucket in context_buckets:
        print("  Context: " + str(context_bucket["val"]) + "  " + str(context_bucket["context_relatedness"]["relatedness"]))
        print("    Keywords: ")
        # Tolerate a context bucket that carries no nested keywords facet.
        for keywords_bucket in context_bucket.get("keywords", {}).get("buckets", []):
            print("      " + str(keywords_bucket["val"]) + "  " + str(keywords_bucket["keywords_relatedness"]["relatedness"]))
        print ("\n")

## Listing 6.3

In [5]:
# Per-example arguments; "server" keeps the function's default context_limit.
disambiguation_examples = (
    {"query": "server"},
    {"query": "driver", "context_limit": 2},
    {"query": "chef", "context_limit": 2},
)

# Every example uses "category" as the context and "body" as the keyword source.
for example_kwargs in disambiguation_examples:
    run_disambiguation_query(context_field="category", keywords_field="body", **example_kwargs)


Query: server
  Context: devops  0.05109
    Keywords: 
      server  0.91319
      servers  0.60808
      tipping  0.48646
      vpn  0.45332
      firewall  0.42315
      tip  0.38742
      proxy  0.21428
      ip  0.21088
      encrypted  0.20859
      restaurant  0.20722


  Context: travel  0.05109
    Keywords: 
      server  0.91319
      servers  0.60808
      tipping  0.48646
      vpn  0.45332
      firewall  0.42315
      tip  0.38742
      proxy  0.21428
      ip  0.21088
      encrypted  0.20859
      restaurant  0.20722


  Context: outdoors  -0.02515
    Keywords: 
      server  0.62582
      can  0.0333
      you  0.02916
      with  0.02723
      it  0.02099
      have  0.01901
      of  0.01526
      in  0.01431
      and  0.01299
      to  0.01102


  Context: cooking  -0.03302
    Keywords: 
      server  0.84833
      restaurant  0.15811
      pie  0.14431
      served  0.12455
      knife  0.11227
      pieces  0.10799
      restaurants  0.10526
      dish  0.0957

## Bonus Examples (not included in chapter)

In [6]:
# List the `job_description` terms most related to the phrase "spark".
# NOTE: this rebinds the module-level `collection` that run_query_classification
# and run_disambiguation_query read, so calling them after this cell targets "jobs".
collection="jobs"

request = {
    "params": {
        "qf": "job_description job_title",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,  # only the facet section is needed, not the documents
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": "\"spark\"",
    "facet": {
        "job_description_keywords": {
            "type": "terms",
            "field": "job_description",
            "sort": { "relatedness": "desc"},
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)"
                }
            }
        }
    }
}

response = requests.post(f"{SOLR_URL}/{collection}/select", json=request)
# Surface HTTP failures directly instead of as a confusing KeyError on the payload.
response.raise_for_status()
search_results = response.json()

# The facet entry can be absent from the response (e.g. when the query matches
# no documents); in that case print nothing rather than raising KeyError.
keyword_buckets = (
    search_results.get("facets", {})
    .get("job_description_keywords", {})
    .get("buckets", [])
)
for bucket in keyword_buckets:
    print(str(bucket["val"]) + "  " + str(bucket["relatedness"]["relatedness"]))

spark  0.80665
hadoop  0.59424
hive  0.52983
kafka  0.51552
impala  0.45309
streamsets  0.39341
scala  0.38564
flume  0.38401
attunity  0.37374
mapreduce  0.36195


In [7]:
# List the `job_description` terms most related to the phrase "chef".
# NOTE: this rebinds the module-level `collection` that run_query_classification
# and run_disambiguation_query read, so calling them after this cell targets "jobs".
collection="jobs"

request = {
    "params": {
        "qf": "job_description job_title",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,  # only the facet section is needed, not the documents
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": "\"chef\"",
    "facet": {
        "job_description_keywords": {
            "type": "terms",
            "field": "job_description",
            "sort": { "relatedness": "desc"},
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)",
                    # NOTE(review): presumably a lower bound on how common a term
                    # must be to receive a score - confirm against the Solr docs.
                    "min_popularity": 0.0005
                }
            }
        }
    }
}

response = requests.post(f"{SOLR_URL}/{collection}/select", json=request)
# Surface HTTP failures directly instead of as a confusing KeyError on the payload.
response.raise_for_status()
search_results = response.json()

# The facet entry can be absent from the response (e.g. when the query matches
# no documents); in that case print nothing rather than raising KeyError.
keyword_buckets = (
    search_results.get("facets", {})
    .get("job_description_keywords", {})
    .get("buckets", [])
)
for bucket in keyword_buckets:
    print(str(bucket["val"]) + "  " + str(bucket["relatedness"]["relatedness"]))

chef  0.80689
puppet  0.59501
ansible  0.52824
terraform  0.3866
jenkins  0.30455
culinary  0.25935
docker  0.25145
cd  0.2434
ci  0.23938
ruby  0.20856


## Success!

You've leveraged a semantic knowledge graph to find related terms for a query, performed query expansion based upon semantically-similar terms, explored multiple different ways to impact precision and recall of queries through integrating semantically-augmented queries, generated content-based recommendations leveraging a semantic knowledge graph, explored arbitrary relationship types by traversing a semantic knowledge graph, and performed both query classification and query disambiguation using a semantic knowledge graph.

Semantic knowledge graphs can be a powerful tool for understanding user intent and interpreting both queries and content based upon meaning instead of just text keywords.

Up next: Chapter 10 - [Learning to Rank](../ch10/1.ch10-setup-TheMovieDB.ipynb)