# Working with Semantic Knowledge Graphs

In [1]:
import sys
sys.path.append('..')
from aips import *
import os
import json
import collections
from IPython.display import display,HTML
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session registered under the app name "AIPS".
spark = SparkSession.builder.appName("AIPS").getOrCreate()
# Search-engine handle; every cell below calls engine.get_collection(...) on it.
# get_engine is presumably supplied by the `from aips import *` above - TODO confirm.
engine = get_engine()

## Listing 5.4

In [2]:
# %load -s skg_request solr
def skg_request(query, qf="title body", field="body", limit=8, mincount=2,
                min_popularity=None):
    """Build a semantic-knowledge-graph request for the terms most related to a query.

    The request runs ``query`` as the foreground set (``fore``) against the
    whole collection (``back`` = ``*:*``) and asks for a terms facet over
    ``field`` whose buckets are sorted by their ``relatedness`` score.

    Args:
        query: Query string placed under the request's "query" key.
        qf: Space-separated fields the edismax foreground query searches.
        field: Field whose terms are scored; also used as the facet label,
            so results are read from ``response["facets"][field]``.
        limit: Maximum number of term buckets to return.
        mincount: Minimum count a term needs to be returned as a bucket.
        min_popularity: Optional value for the relatedness function's
            ``min_popularity`` option. Omitted from the request when None.

    Returns:
        A dict with "query", "params" and "facet" keys. With only ``query``
        supplied it is identical to the previously hard-coded request.
    """
    relatedness = {
        "type": "func",
        "func": "relatedness($fore,$back)"
    }
    if min_popularity is not None:
        relatedness["min_popularity"] = min_popularity

    return {
        "query": query,
        "params": {
            "qf": qf,
            # $qf, $q and $defType are resolved from the other request params.
            "fore": "{!type=$defType qf=$qf v=$q}",
            "back": "*:*",
            "defType": "edismax",
            # Only facet data is wanted, so no documents are requested.
            "rows": 0,
            "echoParams": "none",
            "omitHeader": "true"
        },
        "facet": {
            field: {
                "type": "terms",
                "field": field,
                "sort": {"relatedness": "desc"},
                "mincount": mincount,
                "limit": limit,
                "facet": {
                    "relatedness": relatedness
                }
            }
        }
    }

In [3]:
# Ask the health collection which body terms are most related to "advil".
query = "advil"
health_collection = engine.get_collection("health")
request = skg_request(query)
response = health_collection.search(request)

# Each bucket carries one related term and its relatedness score.
related_terms = response["facets"]["body"]["buckets"]
for bucket in related_terms:
    term, score = bucket["val"], bucket["relatedness"]["relatedness"]
    print(f"{term}  {score}")

advil  0.70986
motrin  0.59897
aleve  0.4662
ibuprofen  0.38264
alleve  0.36649
tylenol  0.33048
naproxen  0.31226
acetaminophen  0.17706


## Listing 5.5

In [14]:
# Same traversal as above, now for "vibranium" in the stackexchange collection.
# `query` and `response` are reused by the query-augmentation cells below.
query = "vibranium"
stackexchange_collection = engine.get_collection("stackexchange")
request = skg_request(query)
response = stackexchange_collection.search(request)

body_facet = response["facets"]["body"]
for bucket in body_facet["buckets"]:
    relatedness = bucket["relatedness"]["relatedness"]
    print(f'{bucket["val"]}  {relatedness}')

vibranium  0.94218
wakandan  0.81909
adamantium  0.80659
wakanda  0.79051
alloy  0.7564
maclain  0.75539
klaw  0.75136
america's  0.73911


## Listing 5.6

## Query Augmentation

In [15]:
# Turn every related term into a "term^boost " clause. Each clause keeps its
# trailing space, so `expansion` itself ends with one.
related_buckets = response["facets"]["body"]["buckets"]
expansion = "".join(
    f'{bucket["val"]}^{bucket["relatedness"]["relatedness"]} '
    for bucket in related_buckets
)

# The original query keeps a much higher boost than any expansion term.
expanded_query = f"{query}^5 {expansion}"
print("Expanded Query:")
print(expanded_query)

Expanded Query:
vibranium^5 vibranium^0.94218 wakandan^0.81909 adamantium^0.80659 wakanda^0.79051 alloy^0.7564 maclain^0.75539 klaw^0.75136 america's^0.73911 


## Listing 5.7

In [6]:
def q(mm):
    """Return the ``q=`` prefix with edismax local params for a given ``mm`` value.

    Args:
        mm: Minimum-match setting as a string, e.g. "0%", "30%" or "2".

    Returns:
        The string ``q={!edismax qf="title body" mm="<mm>"}``.
    """
    local_params_open = 'q={!edismax qf="title body" mm="'
    local_params_close = '"}'
    return local_params_open + mm + local_params_close

# Build five ways of combining the original query with its expansion.
mm_two = q("2")
simple_expansion = q("0%") + f"{query} {expansion}"
increase_conceptual_precision = q("30%") + f"{query} {expansion}"
increase_precision_reduce_recall = mm_two + f"{query} AND {expansion}"
slightly_increased_recall = mm_two + f"{query} {expansion}"
same_results_better_ranking = (
    mm_two + f"{query} &boost=query($expanded_query)&expanded_query={expansion}"
)

# Labels after the first start with a newline so a blank line separates entries.
labeled_queries = [
    ("Simple Query Expansion:", simple_expansion),
    ("\nIncreased Precision, Reduced Recall Query:", increase_conceptual_precision),
    ("\nIncreased Precision, No Reduction in Recall:", increase_precision_reduce_recall),
    ("\nSlightly Increased Recall Query:", slightly_increased_recall),
    ("\nSame Results, Better Conceptual Ranking:", same_results_better_ranking),
]
for label, solr_query in labeled_queries:
    print(label)
    print(solr_query)

#TODO, actually run the search

Simple Query Expansion:
q={!edismax qf="title body" mm="0%"}vibranium vibranium^0.94218 wakandan^0.81909 adamantium^0.80659 wakanda^0.79051 alloy^0.7564 maclain^0.75539 klaw^0.75136 america's^0.73911 

Increased Precision, Reduced Recall Query:
q={!edismax qf="title body" mm="30%"}vibranium vibranium^0.94218 wakandan^0.81909 adamantium^0.80659 wakanda^0.79051 alloy^0.7564 maclain^0.75539 klaw^0.75136 america's^0.73911 

Increased Precision, No Reduction in Recall:
q={!edismax qf="title body" mm="2"}vibranium AND vibranium^0.94218 wakandan^0.81909 adamantium^0.80659 wakanda^0.79051 alloy^0.7564 maclain^0.75539 klaw^0.75136 america's^0.73911 

Slightly Increased Recall Query:
q={!edismax qf="title body" mm="2"}vibranium vibranium^0.94218 wakandan^0.81909 adamantium^0.80659 wakanda^0.79051 alloy^0.7564 maclain^0.75539 klaw^0.75136 america's^0.73911 

Same Results, Better Conceptual Ranking:
q={!edismax qf="title body" mm="2"}vibranium &boost=query($expanded_query)&expanded_query=vibranium

## Content-based Recommendations

## Listing 5.8

In [7]:
def parse_scores(response):
    """Extract (facet key, relatedness) pairs from a response, highest score first.

    Args:
        response: Mapping with a "facets" dict. Entries with an empty key,
            the "count" entry, and entries without a "stats" member are skipped.

    Returns:
        A list of ``(key, relatedness)`` tuples in descending score order.
        Entries with equal scores come out in reverse of their order in
        ``response["facets"]``.
    """
    scored = []
    for key, facet in response["facets"].items():
        # "count" maps to a plain number, so it is ruled out before the
        # membership test on the facet value.
        if not key or key == "count":
            continue
        if "stats" not in facet:
            continue
        scored.append((key, facet["stats"]["relatedness"]))

    # Stable ascending sort followed by a reversal (not reverse=True) keeps the
    # tie ordering described above.
    scored.sort(key=lambda pair: pair[1])
    scored.reverse()
    return scored

stackexchange_collection = engine.get_collection("stackexchange")
classification = "star wars"
document = """this doc contains the words luke, magneto, cyclops,
              darth vader, princess leia, wolverine, apple, banana,
              galaxy, force, blaster, and chloe."""
#run an entity extractor to parse out keywords to score
parsed_document = [
    "this", "doc", "contains", "the", "words", "luke",
    "magneto", "cyclops", "darth vader", "princess leia",
    "wolverine", "apple", "banana", "galaxy", "force",
    "blaster", "and", "chloe",
]

# The classification is the foreground query; one query facet per keyword is
# added below so each keyword gets its own relatedness score.
request = {
    "query": classification,
    "params": {
        "qf": "body",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "relatedness_func": "relatedness($fore,$back)",
        "defType": "edismax",
    },
    "facet": {},
}

for position, keyword in enumerate(parsed_document, start=1):
    # Keyword N is stored as request param tN and referenced through ${tN},
    # both as the facet label and as the facet's query text.
    param_name = f"t{position}"
    macro = "${" + param_name + "}"
    request["params"][param_name] = keyword
    request["facet"][macro] = {
        "type": "query",
        "q": "{!edismax qf=${qf} v=" + macro + "}",
        "facet": {"stats": "${relatedness_func}"},
    }

response = stackexchange_collection.search(request)
scored_terms = parse_scores(response)

print(json.dumps(request, indent="  "))
print(scored_terms)

{
  "query": "star wars",
  "params": {
    "qf": "body",
    "fore": "{!type=$defType qf=$qf v=$q}",
    "back": "*:*",
    "relatedness_func": "relatedness($fore,$back)",
    "defType": "edismax",
    "t1": "this",
    "t2": "doc",
    "t3": "contains",
    "t4": "the",
    "t5": "words",
    "t6": "luke",
    "t7": "magneto",
    "t8": "cyclops",
    "t9": "darth vader",
    "t10": "princess leia",
    "t11": "wolverine",
    "t12": "apple",
    "t13": "banana",
    "t14": "galaxy",
    "t15": "force",
    "t16": "blaster",
    "t17": "and",
    "t18": "chloe"
  },
  "facet": {
    "${t1}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t1}}",
      "facet": {
        "stats": "${relatedness_func}"
      }
    },
    "${t2}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t2}}",
      "facet": {
        "stats": "${relatedness_func}"
      }
    },
    "${t3}": {
      "type": "query",
      "q": "{!edismax qf=${qf} v=${t3}}",
      "facet": {
        "stat

## Listing 5.9

In [8]:
# Keep only terms scoring above 0.35 and turn each into a boosted phrase.
boosted_phrases = []
for term, relatedness in scored_terms:
    if relatedness > 0.35:
        boosted_phrases.append(f'"{term}"^{relatedness}')

rec_query = " ".join(boosted_phrases)
print("Expanded Query:")
print(rec_query)

Expanded Query:
"force"^0.68132 "luke"^0.67651 "galaxy"^0.67116 "darth vader"^0.66252 "blaster"^0.37855


## Listing 5.10

In [9]:
# Run the boosted recommendation query and show the five best-matching titles.
recommendation_params = {
    "qf": "title body",
    "defType": "edismax",
    "rows": 5,
    "echoParams": "none",
    "omitHeader": "true",
    "mm": "0",
    "fl": "title",
    # only show docs with titles to make the example readable
    "fq": "title:[* TO *]",
}
request = {"params": recommendation_params, "query": rec_query}

response = stackexchange_collection.search(request)
print(json.dumps(response, indent="  "))

{
  "response": {
    "numFound": 5010,
    "start": 0,
    "numFoundExact": true,
    "docs": [
      {
        "title": "Did Luke know the &quot;Chosen One&quot; prophecy?"
      },
      {
        "title": "Why couldn't Snoke or Kylo Ren trace Luke using the Force?"
      },
      {
        "title": "Did Darth Vader feel remorse for killing Jedi?"
      },
      {
        "title": "Was Darth Vader at his strongest during Episode III?"
      },
      {
        "title": "Does Kylo Ren know that Darth Vader reconciled with Luke?"
      }
    ]
  }
}


## Exploring Arbitrary Relationships

## Listing 5.11

In [10]:
# Traverse starting node -> relationship -> terminating nodes in the scifi
# collection and print the terms most related to that path.
scifi_collection = engine.get_collection("scifi")
starting_node = '"jean grey"'
relationship = "in love with"
request = skg_request(starting_node)
# The facet query below dereferences $relationship, so the relationship text
# must be sent as a request parameter (like fore/back); otherwise the
# reference has nothing to resolve to.
request["params"]["relationship"] = relationship
# Replace the default terms facet with a two-level one: a query facet for the
# relationship, and inside it the terms ranked by relatedness.
request["facet"] = {
    relationship: {
        "type": "query",
        "query": "{!edismax qf=body v=$relationship}",
        "facet": {
            "terminating_nodes": {
                "type": "terms",
                "field": "body",
                "mincount": 25,
                "limit": 9,
                "sort": { "body_relatedness": "desc"},
                "facet": {
                    "body_relatedness": {
                        "type": "func",
                        "func": "relatedness($fore,$back)"
                    }
                }
            }
        }
    }
}

response = scifi_collection.search(request)
for b in response["facets"][relationship]["terminating_nodes"]["buckets"]:
    print(f'{b["val"]}  {b["body_relatedness"]["relatedness"]}')

jean  0.85044
grey  0.74816
cyclops  0.62027
summers  0.6036
xavier  0.55494
wolverine  0.49161
x  0.46662
mutant  0.46062
magneto  0.43515


## Query Classification

### See Listing 6.1

## Disambiguation

### See Listing 6.2-6.3

## Bonus Examples (not included in chapter)

In [11]:
jobs_collection = engine.get_collection("jobs")

# Score job_description terms by how related they are to the quoted query.
relatedness_stat = {
    "type": "func",
    "func": "relatedness($fore,$back)"
}
keyword_facet = {
    "type": "terms",
    "field": "job_description",
    "sort": {"relatedness": "desc"},
    "facet": {"relatedness": relatedness_stat}
}
request = {
    "params": {
        "qf": "job_description job_title",
        "fore": "{!type=$defType qf=$qf v=$q}",
        "back": "*:*",
        "defType": "edismax",
        "rows": 0,
        "echoParams": "none",
        "omitHeader": "true"
    },
    "query": "\"spark\"",
    "facet": {"job_description_keywords": keyword_facet}
}

response = jobs_collection.search(request)
keyword_buckets = response["facets"]["job_description_keywords"]["buckets"]
for bucket in keyword_buckets:
    term, score = bucket["val"], bucket["relatedness"]["relatedness"]
    print(f"{term}  {score}")

spark  0.80665
hadoop  0.59424
hive  0.52983
kafka  0.51552
impala  0.45309
streamsets  0.39341
scala  0.38564
flume  0.38401
attunity  0.37374
mapreduce  0.36195


In [12]:
jobs_collection = engine.get_collection("jobs")

# Same shape as the previous request, but for "chef" and with a
# min_popularity setting on the relatedness function.
search_params = {
    "qf": "job_description job_title",
    "fore": "{!type=$defType qf=$qf v=$q}",
    "back": "*:*",
    "defType": "edismax",
    "rows": 0,
    "echoParams": "none",
    "omitHeader": "true"
}
keyword_relatedness = {
    "type": "func",
    "func": "relatedness($fore,$back)",
    "min_popularity": 0.0005
}
request = {
    "params": search_params,
    "query": '"chef"',
    "facet": {
        "job_description_keywords": {
            "type": "terms",
            "field": "job_description",
            "sort": {"relatedness": "desc"},
            "facet": {"relatedness": keyword_relatedness}
        }
    }
}

response = jobs_collection.search(request)
keywords = response["facets"]["job_description_keywords"]
for bucket in keywords["buckets"]:
    relatedness = bucket["relatedness"]["relatedness"]
    print(f'{bucket["val"]}  {relatedness}')

chef  0.80689
puppet  0.59501
ansible  0.52824
terraform  0.3866
jenkins  0.30455
culinary  0.25935
docker  0.25145
cd  0.2434
ci  0.23938
ruby  0.20856


## Success!

You've leveraged a semantic knowledge graph to find related terms for a query, performed query expansion based upon semantically-similar terms, explored multiple different ways to impact precision and recall of queries through integrating semantically-augmented queries, generated content-based recommendations leveraging a semantic knowledge graph, and explored arbitrary relationship types by traversing a semantic knowledge graph.

Semantic knowledge graphs can be a powerful tool for understanding user intent and interpreting both queries and content based upon meaning instead of just text keywords.

Up next: Chapter 6 - [Using Context to Learn Domain-specific Language](../ch06/1.skg-classification-disambiguation.ipynb)