# Working with Semantic Knowledge Graphs

In [4]:
import sys
sys.path.append('..')
from aips import *
import os
import json
from IPython.core.display import display,HTML
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("aips-ch5").getOrCreate()

## Listing 5.4

In [5]:
collection="health"
query="advil"

request = {
    "params": {
        "qf": "title body",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": query,
    "facet": {
        "body": {
            "type": "terms",
            "field": "body",
            "sort": { "relatedness": "desc"},
            "mincount": 2,
            "limit": 8,
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)"
                    #"min_popularity": 0.0005
                }
            }            
        }
    }
}

search_results = requests.post(f"{solr_url}{collection}/select", json=request).json()

for bucket in search_results["facets"]["body"]["buckets"]:
  print(f'{bucket["val"]}  {bucket["relatedness"]["relatedness"]}')

advil  0.70986
motrin  0.59897
aleve  0.4662
ibuprofen  0.38264
alleve  0.36649
tylenol  0.33048
naproxen  0.31226
acetaminophen  0.17706


## Listing 5.5

In [3]:
query = "vibranium"

collection="stackexchange"

request = {
    "query": query,
    "params": {
        "qf": "title body",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "facet": {
        "body": {
            "type": "terms",
            "field": "body",
            "sort": { "relatedness": "desc"},
            "mincount": 2,
            "limit": 8,
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)"
                    #"min_popularity": 0.0005
                }
            }            
        }
    }
}

search_results = requests.post(f"{solr_url}{collection}/select", json=request).json()

for bucket in search_results["facets"]["body"]["buckets"]:
  print(f'{bucket["val"]}  {bucket["relatedness"]["relatedness"]}')

vibranium  0.94921
wakandan  0.84076
adamantium  0.82988
wakanda  0.81583
maclain  0.78532
klaw  0.78185
alloy  0.77749
america's  0.76702


## Listing 5.6

## Query Augmentation

In [4]:
query_expansion = ""

terms = search_results["facets"]["body"]["buckets"]
for bucket in search_results["facets"]["body"]["buckets"]:
  term = bucket["val"]
  boost = bucket["relatedness"]["relatedness"]
  if len(query_expansion) > 0: 
    query_expansion += " "
  query_expansion += f"{term}^{boost}"

expanded_query = f"{query}^5 {query_expansion}"

print(f"Expanded Query:\n{expanded_query}")

Expanded Query:
vibranium^5 vibranium^0.94921 wakandan^0.84076 adamantium^0.82988 wakanda^0.81583 maclain^0.78532 klaw^0.78185 alloy^0.77749 america's^0.76702


## Listing 5.7

In [5]:
simple_expansion = 'q={!edismax qf="title body" mm="0%"}' + query + " " + query_expansion
increase_conceptual_precision = 'q={!edismax qf="title body" mm="30%"}' + query + " " + query_expansion
increase_precision_reduce_recall = 'q={!edismax qf="title body" mm="2"}' + query + " AND " + query_expansion
slightly_increased_recall = 'q={!edismax qf="title body" mm="2"}' + query + " " + query_expansion
same_results_better_ranking = 'q={!edismax qf="title body" mm="2"}' + query \
  + "&boost=" + "query($expanded_query)&expanded_query=" + query_expansion


print(f"""Simple Query Expansion:
{simple_expansion}
Increased Precision, Reduced Recall Query:
{increase_conceptual_precision}
Increased Precision, No Reduction in Recall:
{increase_precision_reduce_recall}
Slightly Increased Recall Query:
{slightly_increased_recall}
Same Results Better Conceptual Ranking:
{same_results_better_ranking}""")

#TODO, actually run the search

Simple Query Expansion:
q={!edismax qf="title body" mm="0%"}vibranium vibranium^0.94921 wakandan^0.84076 adamantium^0.82988 wakanda^0.81583 maclain^0.78532 klaw^0.78185 alloy^0.77749 america's^0.76702
Increased Precision, Reduced Recall Query:
q={!edismax qf="title body" mm="30%"}vibranium vibranium^0.94921 wakandan^0.84076 adamantium^0.82988 wakanda^0.81583 maclain^0.78532 klaw^0.78185 alloy^0.77749 america's^0.76702
Increased Precision, No Reduction in Recall:
q={!edismax qf="title body" mm="2"}vibranium AND vibranium^0.94921 wakandan^0.84076 adamantium^0.82988 wakanda^0.81583 maclain^0.78532 klaw^0.78185 alloy^0.77749 america's^0.76702
Slightly Increased Recall Query:
q={!edismax qf="title body" mm="2"}vibranium vibranium^0.94921 wakandan^0.84076 adamantium^0.82988 wakanda^0.81583 maclain^0.78532 klaw^0.78185 alloy^0.77749 america's^0.76702
Same Results Better Conceptual Ranking:
q={!edismax qf="title body" mm="2"}vibranium&boost=query($expanded_query)&expanded_query=vibranium^0.949

## Content-based Recommendations

In [6]:
request_template = {
    "params": {
        "qf": "body",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "relatedness_func": "relatedness($fore,$back)",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true",
        "mm": "100%"
    },
    "facet": { }
}

## Listing 5.8

In [7]:
import collections
from mergedeep import merge

print(solr_url)
collection="stackexchange"
classification="star wars"

document="""this doc contains the words luke, magneto, cyclops, darth vader, 
           princess leia, wolverine, apple, banana, galaxy, force, blaster, 
           and chloe."""

#run an entity extractor to parse out keywords to score
parsed_document = ["this", "doc", "contains", "the", "words", "luke", 
                   "magneto", "cyclops", "darth vader", "princess leia",
                   "wolverine", "apple", "banana", "galaxy", "force",
                   "blaster", "and", "chloe"]

request = {"query": classification, "params": {}, "facet": {}}

i=0
for term in parsed_document:
    i+=1
    key = f"t{i}"
    key2 = "${" + key + "}"
    request["params"][key] = term
    request["facet"][key2] = { 
        "type": "query", 
        "q": "{!edismax qf=${qf} v=" + key2 + "}",  
        "facet": {"stats": "${relatedness_func}"}
    }


print(json.dumps(request,indent="  "))

full_request = merge(request_template, request)
search_results = requests.post(f"{solr_url}{collection}/select", json=full_request).json()

def parse_scores(search_results):
    results = collections.OrderedDict()
    for key in search_results["facets"]:
        if key != "count" and key != "" and "stats" in search_results["facets"][key]:
            relatedness = search_results["facets"][key]["stats"]["relatedness"]
            results[key] = relatedness
    return list(reversed(sorted(results.items(), key=lambda kv: kv[1])))

scored_terms = parse_scores(search_results)

for scored_term in scored_terms:
    print (scored_term)

http://aips-solr:8983/solr/
{
  "query": "star wars",
  "params": {
    "t1": "this",
    "t2": "doc",
    "t3": "contains",
    "t4": "the",
    "t5": "words",
    "t6": "luke",
    "t7": "magneto",
    "t8": "cyclops",
    "t9": "darth vader",
    "t10": "princess leia",
    "t11": "wolverine",
    "t12": "apple",
    "t13": "banana",
    "t14": "galaxy",
    "t15": "force",
    "t16": "blaster",
    "t17": "and",
    "t18": "chloe"
  },
  "facet": {
    "${t1}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t1}}",
      "facet": {
        "stats": "${relatedness_func}"
      }
    },
    "${t2}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t2}}",
      "facet": {
        "stats": "${relatedness_func}"
      }
    },
    "${t3}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t3}}",
      "facet": {
        "stats": "${relatedness_func}"
      }
    },
    "${t4}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t4}}",
      "fac

## Listing 5.9

In [8]:
rec_query = ""

for scored_term in scored_terms:
  term = scored_term[0]
  boost = scored_term[1]
  if len(rec_query) > 0: 
    rec_query += " "
  if boost > 0.25:
    rec_query += f"{term}^{boost}"

print(f"Expanded Query:\n{rec_query}")

Expanded Query:
luke^0.78478 darth vader^0.77255 force^0.7603 galaxy^0.65179 blaster^0.51862 princess leia^0.38976           


## Listing 5.10

In [9]:
import collections

collection="stackexchange"

request = {
    "params": {
        "qf": "title body",
        "defType": "edismax",
        "rows": 5,
        "echoParams": "none",
        "omitHeader": "true",
        "mm": "0",
        "fl": "title",
        "fq": "title:[* TO *]" #only show docs with titles to make the example readable
    },
    "query": rec_query
}

search_results = requests.post(f"{solr_url}{collection}/select", json=request).json()
print(json.dumps(search_results, indent="  "))

{
  "response": {
    "numFound": 5889,
    "start": 0,
    "docs": [
      {
        "title": "What was the deal Lando made with Darth Vader?"
      },
      {
        "title": "At the end of Return of the Jedi, did Darth Vader learn that Princess Leia was his daughter?"
      },
      {
        "title": "Why was Vader not able to read Leia's thoughts like he read Luke's thoughts?"
      },
      {
        "title": "Did Darth Vader know that this character was his daughter from the start?"
      },
      {
        "title": "Why couldn't Snoke or Kylo Ren trace Luke using the Force?"
      }
    ]
  }
}


## Exploring Arbitrary Relationships

## Listing 5.11

In [10]:
collection = "scifi"

starting_node = '"jean grey"'
relationship = '"in love with"'

request = {
    "query": starting_node,
    "params": {
        "qf": "body",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "facet": {
        "in_love_with":{
            "type": "query",
            "query": "{!edismax qf=body v=$relationship}",
            "facet": {
                "terminating_nodes": {
                    "type": "terms",
                    "field": "body",
                    "mincount": 25,
                    "limit": 9,
                    "sort": { "body_relatedness": "desc"},
                    "facet": {
                        "body_relatedness": {
                            "type": "func",
                            "func": "relatedness($fore,$back)"
                        }
                    }
                }
            }
        }
    }
}

search_results = requests.post(f"{solr_url}{collection}/select", json=request).json()

for bucket in search_results["facets"]["in_love_with"]["terminating_nodes"]["buckets"]:
  print(f'{bucket["val"]}  {bucket["body_relatedness"]["relatedness"]}')

jean  0.85044
grey  0.74965
cyclops  0.61313
summers  0.60624
xavier  0.54697
wolverine  0.49361
x  0.46596
mutant  0.46248
magneto  0.43692


## Query Classification

## Listing 5.12

In [11]:
collection="stackexchange"
def run_query_classification(query, keywords_field="body", classification_field="category",
                             classification_limit=5, min_occurrences=5):
    classification_query = {
        "params": {
            "qf": keywords_field,
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            "rows": 0,
            "echoParams": "none",
            "omitHeader": "true"
        },
        "query": query,
        "facet": {
            "classification":{
                "type": "terms",
                "field": classification_field,
                "sort": { "classification_relatedness": "desc"},
                "mincount": min_occurrences, 
                "limit": classification_limit,
                "facet": {
                    "classification_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    }
                }
            }
        }
    }
        
    search_results = requests.post(f"{solr_url}{collection}/select", json=classification_query).json()  
    
    print("Query: " + query) 
    print("  Classifications: ")
    for classification_bucket in search_results["facets"]["classification"]["buckets"]:
        print(f'    {classification_bucket["val"]}  {classification_bucket["classification_relatedness"]["relatedness"]}')
    print("\n")

run_query_classification( query="docker", classification_field="category", classification_limit=3 )
run_query_classification( query="airplane", classification_field="category", classification_limit=1 )
run_query_classification( query="airplane AND crash", classification_field="category", classification_limit=2 )
run_query_classification( query="camping", classification_field="category", classification_limit=2 )
run_query_classification( query="alien", classification_field="category", classification_limit=1 )
run_query_classification( query="passport", classification_field="category", classification_limit=1 )
run_query_classification( query="driver", classification_field="category", classification_limit=2 )
run_query_classification( query="driver AND taxi", classification_field="category", classification_limit=2 )
run_query_classification( query="driver AND install", classification_field="category", classification_limit=2 )


Query: docker
  Classifications: 


Query: airplane
  Classifications: 
    devops  0.20632


Query: airplane AND crash
  Classifications: 
    scifi  0.01927
    devops  0.00426


Query: camping
  Classifications: 
    outdoors  0.70998
    devops  0.00609


Query: alien
  Classifications: 
    scifi  0.69532


Query: passport
  Classifications: 
    devops  0.72612


Query: driver
  Classifications: 
    devops  0.27655
    travel  0.27655


Query: driver AND taxi
  Classifications: 
    devops  0.15285
    travel  0.15285


Query: driver AND install
  Classifications: 
    devops  0.02295
    travel  0.02295




## Disambiguation

## Listing 5.13

In [12]:
collection="stackexchange"
def run_disambiguation_query(query, keywords_field="body", context_field="category",
                             keywords_limit=10, context_limit=5, min_occurrences=5):   
    disambiguation_query = {
        "params": {
            "qf": keywords_field,
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            "rows": 0,
            "echoParams": "none",
            "omitHeader": "true"
        },
        "query": query,
        "facet": {
            "context":{
                "type": "terms",
                "field": context_field,
                "sort": { "context_relatedness": "desc"},
                "mincount": min_occurrences, 
                "limit": context_limit,
                "facet": {
                    "context_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    },        
                    "keywords": {
                        "type": "terms",
                        "field": keywords_field,
                        "mincount": min_occurrences,
                        "limit": keywords_limit,
                        "sort": { "keywords_relatedness": "desc"},
                        "facet": {
                            "keywords_relatedness": {
                                "type": "func",
                                "func": "relatedness($fore,$back)"
                            }
                        }
                    }
                }
            }
        }
    }
    
    search_results = requests.post(f"{solr_url}{collection}/select", json=disambiguation_query).json()  
    
    print(f"Query: {query}") 
    for context_bucket in search_results["facets"]["context"]["buckets"]:
        print(f'  Context: {context_bucket["val"]}  {context_bucket["context_relatedness"]["relatedness"]}')
        print("    Keywords: ")
        for keywords_bucket in context_bucket["keywords"]["buckets"]:
            print(f'      {keywords_bucket["val"]}  {keywords_bucket["keywords_relatedness"]["relatedness"]}')
        print ("\n")

## Listing 5.14

In [13]:
run_disambiguation_query( query="server", context_field="category", keywords_field="body" )
run_disambiguation_query( query="driver", context_field="category", keywords_field="body", context_limit=2 )
run_disambiguation_query( query="chef", context_field="category", keywords_field="body", context_limit=2 )

Query: server
  Context: devops  0.05109
    Keywords: 
      server  0.91319
      servers  0.60808
      tipping  0.48646
      vpn  0.45332
      firewall  0.42315
      tip  0.38742
      proxy  0.21428
      ip  0.21088
      encrypted  0.20859
      restaurant  0.20722


  Context: travel  0.05109
    Keywords: 
      server  0.91319
      servers  0.60808
      tipping  0.48646
      vpn  0.45332
      firewall  0.42315
      tip  0.38742
      proxy  0.21428
      ip  0.21088
      encrypted  0.20859
      restaurant  0.20722


  Context: outdoors  -0.02515
    Keywords: 
      server  0.62582
      can  0.0333
      you  0.02916
      with  0.02723
      it  0.02099
      have  0.01901
      of  0.01526
      in  0.01431
      and  0.01299
      to  0.01102


  Context: cooking  -0.03302
    Keywords: 
      server  0.84833
      restaurant  0.15811
      pie  0.14431
      served  0.12455
      knife  0.11227
      pieces  0.10799
      restaurants  0.10526
      dish  0.0957

## Bonus Examples (not included in chapter)

In [14]:
collection="jobs"

request = {
    "params": {
        "qf": "job_description job_title",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": "\"spark\"",
    "facet": {
        "job_description_keywords": {
            "type": "terms",
            "field": "job_description",
            "sort": { "relatedness": "desc"},
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)"
                }
            }            
        }
    }
}

#search_results = json.dumps(requests.post(solr_url + collection + "/select", json=request).json(), indent=2)
search_results = requests.post(f"{solr_url}{collection}/select", json=request).json()

for bucket in search_results["facets"]["job_description_keywords"]["buckets"]:
  print(f'{bucket["val"]}  {bucket["relatedness"]["relatedness"]}')

spark  0.80665
hadoop  0.59424
hive  0.52983
kafka  0.51552
impala  0.45309
streamsets  0.39341
scala  0.38564
flume  0.38401
attunity  0.37374
mapreduce  0.36195


In [15]:
collection="jobs"

request = {
    "params": {
        "qf": "job_description job_title",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": "\"chef\"",
    "facet": {
        "job_description_keywords": {
            "type": "terms",
            "field": "job_description",
            "sort": { "relatedness": "desc"},
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)",
                    "min_popularity": 0.0005
                }
            }            
        }
    }
}

search_results = requests.post(f"{solr_url}{collection}/select", json=request).json()

for bucket in search_results["facets"]["job_description_keywords"]["buckets"]:
  print(f'{bucket["val"]}  {bucket["relatedness"]["relatedness"]}')

chef  0.80689
puppet  0.59501
ansible  0.52824
terraform  0.3866
jenkins  0.30455
culinary  0.25935
docker  0.25145
cd  0.2434
ci  0.23938
ruby  0.20856


## Success!

You've leveraged a semantic knowledge graph to find related terms for a query, performed query expansion based upon semantically-similar terms, explored multiple different way to impact precision and recall of queries through integrating semantically-augmented queries, generated content-based recommendations leveraging a semantic knowledge graph, explored arbitrary relationship types by traversing a semantic knowledge graph, and performed both query classification and query disambiguration using a semantic knowledge graph.

Semantic knowledge graphs can be a powerful tool for understaning user intent and interpreting both queries and content based upon meaning instead of just text kewords.

Up next: Chapter 10 - [Learning to Rank](../ch10/1.ch10-setup-TheMovieDB.ipynb)