# Working with Semantic Knowledge Graphs

In [1]:
import sys
sys.path.append('..')
from aips import *
import os
import json
from IPython.core.display import display,HTML
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("aips-ch5").getOrCreate()

## Listing 5.4

In [2]:
collection="health"
query="advil"

request = {
    "params": {
        "qf": "title body",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": query,
    "facet": {
        "body": {
            "type": "terms",
            "field": "body",
            "sort": { "relatedness": "desc"},
            "mincount": 2,
            "limit": 8,
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)"
                    #"min_popularity": 0.0005
                }
            }            
        }
    }
}

search_results = requests.post(solr_url + collection + "/select", json=request).json()

for bucket in search_results["facets"]["body"]["buckets"]:
  print(str(bucket["val"]) + "  " + str(bucket["relatedness"]["relatedness"]))

advil  0.70986
motrin  0.59897
aleve  0.4662
ibuprofen  0.38264
alleve  0.36649
tylenol  0.33048
naproxen  0.31226
acetaminophen  0.17706


## Listing 5.5

In [92]:
query = "vibranium"

collection="stackexchange"

request = {
    "query": query,
    "params": {
        "qf": "title body",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "facet": {
        "body": {
            "type": "terms",
            "field": "body",
            "sort": { "relatedness": "desc"},
            "mincount": 2,
            "limit": 8,
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)"
                    #"min_popularity": 0.0005
                }
            }            
        }
    }
}

search_results = requests.post(solr_url + collection + "/select", json=request).json()

for bucket in search_results["facets"]["body"]["buckets"]:
  print(str(bucket["val"]) + "  " + str(bucket["relatedness"]["relatedness"]))

vibranium  0.92227
wakandan  0.7551
wakanda  0.75377
adamantium  0.74686
panther's  0.69938
klaue  0.68194
panther  0.65396
klaw  0.6532


## Listing 5.6

## Query Augmentation

In [94]:
query_expansion = ""

terms = search_results["facets"]["body"]["buckets"]
for bucket in search_results["facets"]["body"]["buckets"]:
  term = bucket["val"]
  boost = bucket["relatedness"]["relatedness"]
  if len(query_expansion) > 0: 
    query_expansion += " "
  query_expansion += " " + term + "^" + str(boost)

expanded_query = query + "^5" + query_expansion

print("Expanded Query:\n" + expanded_query)

Expanded Query:
vibranium^5 vibranium^0.92227  wakandan^0.7551  wakanda^0.75377  adamantium^0.74686  panther's^0.69938  klaue^0.68194  panther^0.65396  klaw^0.6532


## Listing 5.7

In [100]:
simple_expansion = 'q={!edismax qf="title body" mm="0%"}' + query + " " + query_expansion
increase_conceptual_precision = 'q={!edismax qf="title body" mm="30%"}' + query + " " + query_expansion
increase_precision_reduce_recall = 'q={!edismax qf="title body" mm="2"}' + query + " AND" + ( query_expansion )
slightly_increased_recall = 'q={!edismax qf="title body" mm="2"}' + query + query_expansion
same_results_better_ranking = 'q={!edismax qf="title body" mm="2"}' + query \
  + "&boost=" + "query($expanded_query)&expanded_query=" + query_expansion


print("Simple Query Expansion:\n" + simple_expansion)
print("\nIncreased Precision, Reduced Recall Query:\n" + increase_conceptual_precision)
print("\nIncreased Precision, No Reduction in Recall:\n" + increase_precision_reduce_recall)
print("\nSlightly Increased Recall Query:\n" + slightly_increased_recall)
print("\nSame Results, Better Conceptual Ranking:\n" + same_results_better_ranking)

#TODO, actually run the search

Simple Query Expansion:
q={!edismax qf="title body" mm="0%"}vibranium  vibranium^0.92227  wakandan^0.7551  wakanda^0.75377  adamantium^0.74686  panther's^0.69938  klaue^0.68194  panther^0.65396  klaw^0.6532

Increased Precision, Reduced Recall Query:
q={!edismax qf="title body" mm="30%"}vibranium  vibranium^0.92227  wakandan^0.7551  wakanda^0.75377  adamantium^0.74686  panther's^0.69938  klaue^0.68194  panther^0.65396  klaw^0.6532

Increased Precision, No Reduction in Recall:
q={!edismax qf="title body" mm="2"}vibranium AND vibranium^0.92227  wakandan^0.7551  wakanda^0.75377  adamantium^0.74686  panther's^0.69938  klaue^0.68194  panther^0.65396  klaw^0.6532

Slightly Increased Recall Query:
q={!edismax qf="title body" mm="2"}vibranium vibranium^0.92227  wakandan^0.7551  wakanda^0.75377  adamantium^0.74686  panther's^0.69938  klaue^0.68194  panther^0.65396  klaw^0.6532

Same Results Better Conceptual Ranking:
q={!edismax qf="title body" mm="2"}vibranium&boost=query($expanded_query)&expa

## Content-based Recommendations

In [220]:
request_template = {
    "params": {
        "qf": "body",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "relatedness_func": "relatedness($fore,$back)",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true",
        "mm": "100%"
    },
    "facet": { }
}



## Listing 5.8

In [228]:
import collections
from mergedeep import merge

print(solr_url)
collection="stackexchange"
classification="star wars"

document="""this doc contains the words luke, magneto, cyclops, darth vader, 
           princess leia, wolverine, apple, banana, galaxy, force, blaster, 
           and chloe."""

#run an entity extractor to parse out keywords to score
parsed_document = ["this", "doc", "contains", "the", "words", "luke", \
                   "magneto", "cyclops", "darth vader", "princess leia", \ 
                   "wolverine", "apple", "banana", "galaxy", "force", \
                   "blaster", "and", "chloe"]

request = {"query": classification, "params": {}, "facet": {}}

i=0
for term in parsed_document:
    i+=1
    key = "t" + str(i)
    key2 = "${" + key + "}"
    request["params"][key] = term
    request["facet"][key2] = { 
        "type": "query", 
        "q": "{!edismax qf=${qf} v=" + key2 + "}",  
        "facet": {"stats": "${relatedness_func}"}
    }


print(json.dumps(request,indent="  "))

full_request = merge(request_template, request)
search_results = requests.post(solr_url + collection + "/select", json=full_request).json()

def parse_scores(search_results):
    results = collections.OrderedDict()
    for key in search_results["facets"]:
        if key != "count" and key != "" and "stats" in search_results["facets"][key]:
            relatedness = search_results["facets"][key]["stats"]["relatedness"]
            results[key] = relatedness
    return list(reversed(sorted(results.items(), key=lambda kv: kv[1])))

scored_terms = parse_scores(search_results)

for scored_term in scored_terms:
    print (scored_term)

http://aips-solr:8983/solr/
{
  "query": "star wars",
  "params": {
    "t1": "this",
    "t2": "doc",
    "t3": "contains",
    "t4": "the",
    "t5": "words",
    "t6": "luke",
    "t7": "magneto",
    "t8": "cyclops",
    "t9": "darth vader",
    "t10": "princess leia",
    "t11": "wolverine",
    "t12": "apple",
    "t13": "banana",
    "t14": "galaxy",
    "t15": "force",
    "t16": "blaster",
    "t17": "and",
    "t18": "chloe"
  },
  "facet": {
    "${t1}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t1}}",
      "facet": {
        "stats": "${relatedness_func}"
      }
    },
    "${t2}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t2}}",
      "facet": {
        "stats": "${relatedness_func}"
      }
    },
    "${t3}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t3}}",
      "facet": {
        "stats": "${relatedness_func}"
      }
    },
    "${t4}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t4}}",
      "fac

## Listing 5.9

In [250]:
rec_query = ""

for scored_term in scored_terms:
  term = scored_term[0]
  boost = scored_term[1]
  if len(rec_query) > 0: 
    rec_query += " "
  if boost > 0.25:
    rec_query += term + "^" + str(boost)

print("Expanded Query:\n" + rec_query)

Expanded Query:
luke^0.66366 darth vader^0.6311 force^0.59269 galaxy^0.45858 blaster^0.39121 princess leia^0.25119           


## Listing 5.10

In [251]:
import collections

collection="stackexchange"

request = {
    "params": {
        "qf": "title body",
        "defType": "edismax",
        "rows": 5,
        "echoParams": "none",
        "omitHeader": "true",
        "mm": "0",
        "fl": "title",
        "fq": "title:[* TO *]" #only show docs with titles to make the example readable
    },
    "query": rec_query
}

search_results = requests.post(solr_url + collection + "/select", json=request).json()
print(json.dumps(search_results, indent="  "))

{
  "response": {
    "numFound": 2864,
    "start": 0,
    "docs": [
      {
        "title": "&quot;Help me, Obi-Wan Kenobi&quot; -- how does Leia know who Obi-Wan is?"
      },
      {
        "title": "Why couldn't Snoke or Kylo Ren trace Luke using the Force?"
      },
      {
        "title": "Did Luke know the &quot;Chosen One&quot; prophecy?"
      },
      {
        "title": "Was Darth Vader at his strongest during Episode III?"
      },
      {
        "title": "Is there evidence that Lucas intentionally introduced similarities between Episodes I and IV?"
      }
    ]
  }
}


## Exploring Arbitrary Relationships

## Listing 5.11

In [88]:
collection = "scifi"

starting_node = '"jean grey"'
relationship = '"in love with"'


request = {
    "query": starting_node,
    "params": {
        "qf": "body",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "facet": {
        "in_love_with":{
            "type": "query",
            "query": "{!edismax qf=body v=$relationship}",
            "facet": {
                "terminating_nodes": {
                    "type": "terms",
                    "field": "body",
                    "mincount": 25,
                    "limit": 9,
                    "sort": { "body_relatedness": "desc"},
                    "facet": {
                        "body_relatedness": {
                            "type": "func",
                            "func": "relatedness($fore,$back)"
                        }
                    }
                }
            }
        }
    }
}

search_results = requests.post(solr_url + collection + "/select", json=request).json()

for bucket in search_results["facets"]["in_love_with"]["terminating_nodes"]["buckets"]:
  print(str(bucket["val"]) + "  " + str(bucket["body_relatedness"]["relatedness"]))

jean  0.85044
grey  0.74965
cyclops  0.61313
summers  0.60624
xavier  0.54697
wolverine  0.49361
x  0.46596
mutant  0.46248
magneto  0.43692


## Query Classification

## Listing 5.12

In [86]:
def run_query_classification(query,keywords_field="body",classification_field="category",classification_limit=5,min_occurrences=5):      
    
    classification_query = {
        "params": {
            "qf": keywords_field,
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            "rows": 0,
            "echoParams": "none",
            "omitHeader": "true"
        },
        "query": query,
        "facet": {
            "classification":{
                "type": "terms",
                "field": classification_field,
                "sort": { "classification_relatedness": "desc"},
                "mincount": min_occurrences, 
                "limit": classification_limit,
                "facet": {
                    "classification_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    }
                }
            }
        }
    }
        
    search_results = requests.post(solr_url + collection + "/select", json=classification_query).json()  
    
    print("Query: " + query) 
    print("  Classifications: ")
    for classification_bucket in search_results["facets"]["classification"]["buckets"]:
        print("    " + str(classification_bucket["val"]) + "  " + str(classification_bucket["classification_relatedness"]["relatedness"]))
    print("\n")

run_query_classification( query="docker", classification_field="category", classification_limit=3 )
run_query_classification( query="airplane", classification_field="category", classification_limit=1 )
run_query_classification( query="airplane AND crash", classification_field="category", classification_limit=2 )
run_query_classification( query="camping", classification_field="category", classification_limit=2 )
run_query_classification( query="alien", classification_field="category", classification_limit=1 )
run_query_classification( query="passport", classification_field="category", classification_limit=1 )
run_query_classification( query="driver", classification_field="category", classification_limit=2 )
run_query_classification( query="driver AND taxi", classification_field="category", classification_limit=2 )
run_query_classification( query="driver AND install", classification_field="category", classification_limit=2 )


Query: docker
  Classifications: 
    devops  0.8376


Query: airplane
  Classifications: 
    travel  0.20591


Query: airplane AND crash
  Classifications: 
    scifi  0.01938
    travel  -0.01068


Query: camping
  Classifications: 
    outdoors  0.40323
    travel  0.10778


Query: alien
  Classifications: 
    scifi  0.51953


Query: passport
  Classifications: 
    travel  0.73494


Query: driver
  Classifications: 
    travel  0.23835
    devops  0.04461


Query: driver AND taxi
  Classifications: 
    travel  0.1525
    scifi  -0.1301


Query: driver AND install
  Classifications: 
    devops  0.1661
    travel  -0.03103




## Disambiguation

## Listing 5.13

In [51]:
def run_disambiguation_query(query,keywords_field="body",context_field="category",keywords_limit=10,context_limit=5,min_occurrences=5):      
    
    disambiguation_query = {
        "params": {
            "qf": keywords_field,
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            "rows": 0,
            "echoParams": "none",
            "omitHeader": "true"
        },
        "query": query,
        "facet": {
            "context":{
                "type": "terms",
                "field": context_field,
                "sort": { "context_relatedness": "desc"},
                "mincount": min_occurrences, 
                "limit": context_limit,
                "facet": {
                    "context_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    },        
                    "keywords": {
                        "type": "terms",
                        "field": keywords_field,
                        "mincount": min_occurrences,
                        "limit": keywords_limit,
                        "sort": { "keywords_relatedness": "desc"},
                        "facet": {
                            "keywords_relatedness": {
                                "type": "func",
                                "func": "relatedness($fore,$back)"
                            }
                        }
                    }
                }
            }
        }
    }
        
    search_results = requests.post(solr_url + collection + "/select", json=disambiguation_query).json()  
    
    print("Query: " + query) 
    for context_bucket in search_results["facets"]["context"]["buckets"]:
        print("  Context: " + str(context_bucket["val"]) + "  " + str(context_bucket["context_relatedness"]["relatedness"]))
        print("    Keywords: ")
        for keywords_bucket in context_bucket["keywords"]["buckets"]:
            print("      " + str(keywords_bucket["val"]) + "  " + str(keywords_bucket["keywords_relatedness"]["relatedness"]))
        print ("\n")

## Listing 5.14

In [53]:
run_disambiguation_query( query="server", context_field="category", keywords_field="body" )
run_disambiguation_query( query="driver", context_field="category", keywords_field="body", context_limit=2 )
run_disambiguation_query( query="chef", context_field="category", keywords_field="body", context_limit=2 )


Query: server
  Context: devops  0.787
    Keywords: 
      server  0.91786
      servers  0.69526
      docker  0.66753
      code  0.65852
      configuration  0.60976
      deploy  0.60332
      nginx  0.5847
      jenkins  0.57877
      git  0.56514
      ssh  0.55581


  Context: scifi  -0.27326
    Keywords: 
      server  0.56847
      computer  0.16903
      computers  0.16403
      servers  0.14156
      virtual  0.12126
      communicate  0.09928
      real  0.098
      storage  0.09732
      system  0.08375
      inside  0.0771


  Context: travel  -0.28334
    Keywords: 
      server  0.74462
      tipping  0.47834
      tip  0.39491
      servers  0.30689
      vpn  0.27551
      tips  0.19982
      restaurant  0.19672
      bill  0.16507
      wage  0.1555
      restaurants  0.15309


Query: driver
  Context: travel  0.23835
    Keywords: 
      driver  0.91524
      drivers  0.68676
      taxi  0.6008
      car  0.54811
      license  0.51488
      driving  0.50301
     

## Bonus Examples (not included in chapter)

In [103]:
collection="jobs"

request = {
    "params": {
        "qf": "job_description job_title",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": "\"spark\"",
    "facet": {
        "job_description_keywords": {
            "type": "terms",
            "field": "job_description",
            "sort": { "relatedness": "desc"},
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)"
                }
            }            
        }
    }
}

#search_results = json.dumps(requests.post(solr_url + collection + "/select", json=request).json(), indent=2)
search_results = requests.post(solr_url + collection + "/select", json=request).json()

for bucket in search_results["facets"]["job_description_keywords"]["buckets"]:
  print(str(bucket["val"]) + "  " + str(bucket["relatedness"]["relatedness"]))    

spark  0.80665
hadoop  0.59424
hive  0.52983
kafka  0.51552
impala  0.45309
streamsets  0.39341
scala  0.38564
flume  0.38401
attunity  0.37374
mapreduce  0.36195


In [106]:
collection="jobs"

request = {
    "params": {
        "qf": "job_description job_title",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": "\"chef\"",
    "facet": {
        "job_description_keywords": {
            "type": "terms",
            "field": "job_description",
            "sort": { "relatedness": "desc"},
            "facet": {
                "relatedness": {
                    "type": "func",
                    "func": "relatedness($fore,$back)",
                    "min_popularity": 0.0005
                }
            }            
        }
    }
}

#search_results = json.dumps(requests.post(solr_url + collection + "/select", json=request).json(), indent=2)
search_results = requests.post(solr_url + collection + "/select", json=request).json()

for bucket in search_results["facets"]["job_description_keywords"]["buckets"]:
  print(str(bucket["val"]) + "  " + str(bucket["relatedness"]["relatedness"]))    

chef  0.80689
puppet  0.59501
ansible  0.52824
terraform  0.3866
jenkins  0.30455
culinary  0.25935
docker  0.25145
cd  0.2434
ci  0.23938
ruby  0.20856


## Success!

You've leveraged a semantic knowledge graph to find related terms for a query, performed query expansion based upon semantically-similar terms, explored multiple different way to impact precision and recall of queries through integrating semantically-augmented queries, generated content-based recommendations leveraging a semantic knowledge graph, explored arbitrary relationship types by traversing a semantic knowledge graph, and performed both query classification and query disambiguration using a semantic knowledge graph.

Semantic knowledge graphs can be a powerful tool for understaning user intent and interpreting both queries and content based upon meaning instead of just text kewords.

Up next: Chapter 10 - [Learning to Rank](../ch10/1.ch10-setup-TheMovieDB.ipynb)