# Working with Semantic Knowledge Graphs

In [1]:
import sys

sys.path.append('..')
sys.path.append("webserver")

from aips import get_engine, get_semantic_knowledge_graph
import json
from pyspark.sql import SparkSession

# Create the Spark session named "AIPS", or reuse one that already exists.
spark = SparkSession.builder.appName("AIPS").getOrCreate()
# `engine` hands out collections (see the `engine.get_collection(...)` calls below).
engine = get_engine()
# `skg` exposes `traverse(collection, *nodes)` for semantic knowledge graph queries.
skg = get_semantic_knowledge_graph()

## Listing 5.4

In [14]:
# %load -s generate_request_root,generate_facets,default_node_name,validate_skg_request_input,generate_skg_request,transform_node,transform_response_facet,sort_by_relatedness_desc,traverse engine/solr/skg
def generate_request_root():
    """Return a fresh, facet-less root of an SKG request.

    The root matches every document (`q` and `back` are `*:*`), returns no
    documents (`limit` is 0) and defines the `fore`/`back` parameters that the
    relatedness functions of nested facets refer to.
    """
    params = dict(q="*:*",
                  fore="{!type=$defType v=$q}",
                  back="*:*",
                  defType="edismax")
    return dict(limit=0, params=params, facet={})

def generate_facets(name=None, values=None, field=None,
                    min_occurrences=None, limit=None,
                    min_popularity=None, default_operator="AND"):
    """Build the facet definition(s) for a single SKG node.

    Without `values` a single `terms` facet is returned. With `values` one
    `query` facet is returned per value; the value itself is not embedded in
    the facet but referenced through the request parameter
    `<name>_<index>_query`.

    :param str name: Node name, used to build the referenced parameter names.
    :param list values: Values to query for. Empty/None selects a terms facet.
    :param str field: Field to facet on (terms) or to query against (query).
    :param int min_occurrences: `mincount` of the terms facet. Not applied to
        query facets.
    :param int limit: Facet limit. Terms facets default to 10; query facets
        only carry a limit when one is given explicitly.
    :param min_popularity: Optional `min_popularity` option of the
        relatedness function.
    :param str default_operator: Emitted as `q.op=<operator>` in query
        facets; a falsy value omits it.
    :returns: A list of facet dicts. Each one is a completely independent
        structure, so nesting children into one never affects another.
    """
    def build_facet(facet_type):
        # Build a brand-new dict tree on every call: copying a template
        # shallowly would make all facets share the same nested "facet" and
        # "sort" dicts.
        relatedness = {"type": "func",
                       "func": "relatedness($fore,$back)"}
        if min_popularity:
            relatedness["min_popularity"] = min_popularity
        facet = {"type": facet_type}
        if facet_type == "terms" or limit:
            facet["limit"] = limit if limit else 10
        facet["sort"] = {"relatedness": "desc"}
        facet["facet"] = {"relatedness": relatedness}
        if facet_type == "terms" and min_occurrences:
            facet["mincount"] = min_occurrences
        if field:
            facet["field"] = field
        return facet

    if not values:
        return [build_facet("terms")]

    op = f"q.op={default_operator} " if default_operator else ""
    facets = []
    for i, _ in enumerate(values):
        facet = build_facet("query")
        facet["query"] = "{" + f'!edismax {op}qf={field} v=${name}_{i}_query' + "}"
        facets.append(facet)
    return facets

def default_node_name(i, j):
    """Return the default name `f<i>`, or `f<i>_<j>` when `j` is truthy."""
    prefix = "f" + str(i)
    if not j:
        return prefix
    return prefix + f"_{j}"

def validate_skg_request_input(multi_node):
    """Validate a multi-node: a single node dict or a list/tuple of nodes.

    :param multi_node: A node dict, or a list/tuple of nodes that share one
        level of the traversal.
    :raises ValueError: If a node has no `field`, or if two nodes on the same
        level were given the same explicit `name`.
    """
    if isinstance(multi_node, (list, tuple)):
        # An explicit loop is required: `map` is lazy and would never run
        # the nested validation.
        for node in multi_node:
            validate_skg_request_input(node)
        # Names are optional, so only explicitly supplied ones are compared.
        node_names = [node["name"] for node in multi_node
                      if isinstance(node, dict) and "name" in node]
        if len(node_names) != len(set(node_names)):
            raise ValueError("Node names must be distinct on a given level.")
        # A collection has no `field` of its own; its members were checked.
        return
    if "field" not in multi_node:
        raise ValueError("'field' must be provided")

def generate_skg_request(*multi_nodes):
    """Generates a faceted Solr SKG request from a set of multi-nodes.

       A multi-node can be a single node (a dict) or a collection of nodes.
       Each subsequent multi-node is applied as a nested facet to all facets
       generated for the previous multi-node.

       A node can contain the following params, which are forwarded as
       keyword arguments to `generate_facets`:
       :param str name: An optional name of the node. If not provided,
           `default_node_name(level, position)` is used.
       :param list of str values: If empty or absent, a terms facet is used.
           Otherwise a query facet per value is used.
       :param str field: The field to query against or discover values from.
           Required.
       :param int min_occurrences: The mincount on the facet.
       :param int limit: The limit on the facet.
       :param min_popularity: Optional `min_popularity` of the relatedness
           function.
       :param str default_operator: Optional `q.op` used by query facets.

       The node dicts passed in are left unmodified.

       :returns: The request dict, with every node's values registered as
           `<name>_<index>_query` entries under `params`.
       :raises ValueError: If a node has no `field`, or if two nodes on the
           same level end up with the same name."""
    request = generate_request_root()
    parent_nodes = [request]
    for level, multi_node in enumerate(multi_nodes):
        nodes = [multi_node] if isinstance(multi_node, dict) else multi_node

        # Validate and name the nodes of this level up front. Copies are
        # used so the default names never leak into the caller's dicts.
        named_nodes = []
        for position, node in enumerate(nodes):
            validate_skg_request_input(node)
            named_node = dict(node)
            if "name" not in named_node:
                named_node["name"] = default_node_name(level, position)
            named_nodes.append(named_node)
        names = [named_node["name"] for named_node in named_nodes]
        if len(names) != len(set(names)):
            raise ValueError("Node names must be distinct on a given level.")

        current_facets = []
        for node in named_nodes:
            facets = generate_facets(**node)
            current_facets.extend(facets)
            # Dedicated index names below keep `level`/`position` intact.
            for parent_node in parent_nodes:
                for facet_index, facet in enumerate(facets):
                    parent_node["facet"][f'{node["name"]}_{facet_index}'] = facet
            for value_index, value in enumerate(node.get("values") or []):
                request["params"][f'{node["name"]}_{value_index}_query'] = value
        parent_nodes = current_facets
    return request

def transform_node(node, response_params):
    """Convert one response facet/bucket into a graph value node.

    The result holds the node's relatedness (0.0 when its `count` is not
    positive) and, when the node contains nested facets, their converted
    form under `traversals`.
    """
    if node["count"] > 0:
        score = node["relatedness"]["relatedness"]
    else:
        score = 0.0
    converted = {"relatedness": score}
    nested = transform_response_facet(node, response_params)
    if nested:
        converted["traversals"] = nested
    return converted

def transform_response_facet(node, response_params):
    """Convert the facets nested in `node` into a list of traversals.

    Every key of `node` except `count`, `relatedness` and `val` is treated
    as a facet named `<name>_<suffix>`; facets sharing `<name>` are merged
    into one traversal `{"name": <name>, "values": {...}}`. A facet with
    `buckets` contributes one value per bucket (keyed by the bucket's `val`);
    any other facet contributes a single value keyed by the request parameter
    `<full name>_query`. Values are ordered by descending relatedness.
    """
    skipped = ("count", "relatedness", "val")
    grouped = {}
    for facet_key, payload in node.items():
        if facet_key in skipped:
            continue
        # Drop the trailing "_<suffix>"; a key without "_" stays as it is.
        base_name = facet_key.rsplit("_", 1)[0]
        entry = grouped.setdefault(base_name,
                                   {"name": base_name, "values": {}})
        if "buckets" in payload:
            entry["values"] = {
                bucket["val"]: transform_node(bucket, response_params)
                for bucket in payload["buckets"]
            }
        else:
            label = response_params[f"{facet_key}_query"]
            entry["values"][label] = transform_node(payload, response_params)
    for entry in grouped.values():
        entry["values"] = sort_by_relatedness_desc(entry["values"])
    return list(grouped.values())

def sort_by_relatedness_desc(d):
    """Return a copy of `d` ordered by each value's `relatedness`, highest first."""
    def relatedness_of(entry):
        return entry[1]["relatedness"]

    ranked = sorted(d.items(), key=relatedness_of, reverse=True)
    return dict(ranked)

def traverse(collection, *nodes):
    """Run an SKG traversal for `nodes` against `collection`.

    Builds the request, sends it through `collection.native_search` and
    returns the converted facets of the response as `{"graph": [...]}`.
    """
    skg_request = generate_skg_request(*nodes)
    response_facets = collection.native_search(skg_request)["facets"]
    graph = transform_response_facet(response_facets, skg_request["params"])
    return {"graph": graph}

In [3]:
def print_graph(traversal):
    """Print every term related to "advil" followed by its relatedness."""
    start_node = traversal["graph"][0]["values"]["advil"]
    related_terms = start_node["traversals"][0]["values"]
    for term, data in related_terms.items():
        print(f'{term}  {data["relatedness"]}')
    
# Collection holding the "health" dataset.
health_collection = engine.get_collection("health")
# Two-level traversal: start from the documents matching "advil" in `body`,
# then discover up to 8 `body` terms that occur at least twice.
nodes_to_traverse = [{"field": "body", 
                      "values": ["advil"]},
                     {"field": "body",
                      "min_occurrences": 2,
                      "limit": 8}]

traversal = skg.traverse(health_collection, *nodes_to_traverse)
# Built separately with the notebook-local function; only used for the
# JSON dump below.
skg_search_request = generate_skg_request(*nodes_to_traverse)

print_graph(traversal)
print(json.dumps(skg_search_request, indent=2))

advil  0.70986
motrin  0.59897
aleve  0.4662
ibuprofen  0.38264
alleve  0.36649
tylenol  0.33048
naproxen  0.31226
acetaminophen  0.17706
{
  "limit": 0,
  "params": {
    "q": "*:*",
    "fore": "{!type=$defType v=$q}",
    "back": "*:*",
    "defType": "edismax",
    "f0_0_query": "advil"
  },
  "facet": {
    "f0_0": {
      "type": "query",
      "sort": {
        "relatedness": "desc"
      },
      "facet": {
        "relatedness": {
          "type": "func",
          "func": "relatedness($fore,$back)"
        },
        "f1_0": {
          "type": "terms",
          "limit": 8,
          "sort": {
            "relatedness": "desc"
          },
          "facet": {
            "relatedness": {
              "type": "func",
              "func": "relatedness($fore,$back)"
            }
          },
          "mincount": 2,
          "field": "body"
        }
      },
      "field": "body",
      "query": "{!edismax qf=body v=$f0_0_query}"
    }
  }
}


## Listing 5.5

In [4]:
def print_graph(traversal):
    """Print every term related to "vibranium" followed by its relatedness."""
    first_level = traversal["graph"][0]["values"]
    second_level = first_level["vibranium"]["traversals"][0]
    for term, data in second_level["values"].items():
        print(f'{term}  {data["relatedness"]}')
        
stackexchange_collection = engine.get_collection("stackexchange")
# `query` is reused by the query-augmentation listings that follow.
query = "vibranium"
# Start from the documents matching the query in `body`, then discover up to
# 8 `body` terms that occur at least twice.
nodes_to_traverse = [{"field": "body", "values": [query]},
                     {"field": "body", "min_occurrences": 2, "limit": 8}]

traversal = skg.traverse(stackexchange_collection, *nodes_to_traverse)

print_graph(traversal)

vibranium  0.94237
wakandan  0.8197
adamantium  0.80724
wakanda  0.79122
alloy  0.75724
maclain  0.75623
klaw  0.75222
america's  0.74002


## Listing 5.6

## Query Augmentation

In [5]:
# Turn every related term into "<term>^<relatedness> "; the trailing space
# after each entry (including the last one) is part of the expansion string.
related_terms = traversal["graph"][0]["values"]["vibranium"]["traversals"][0]["values"]
expansion = "".join(f'{term}^{stats["relatedness"]} '
                    for term, stats in related_terms.items())
# The original query gets a fixed boost of 5 in front of the expansion.
expanded_query = f"{query}^5 " + expansion

print(f"Expanded Query:\n{expanded_query}")

Expanded Query:
vibranium^5 vibranium^0.94237 wakandan^0.8197 adamantium^0.80724 wakanda^0.79122 alloy^0.75724 maclain^0.75623 klaw^0.75222 america's^0.74002 


## Listing 5.7

In [6]:
def generate_request(query, min_match=None, boost=None):
    """Build a search request over the `title` and `body` fields.

    `min_match` and `boost` are only included (as `min_match` and
    `query_boosts`) when they are truthy.
    """
    optional_settings = {"min_match": min_match, "query_boosts": boost}
    search_request = {"query": query, "query_fields": ["title", "body"]}
    for key, setting in optional_settings.items():
        if setting:
            search_request[key] = setting
    return search_request
    
# Five ways of combining the original `query` with the SKG `expansion` string
# from the previous listing. They differ in how the expansion is attached
# (appended, AND-ed, or passed as a boost) and in the `min_match` setting.
simple_expansion = generate_request(f"{query} {expansion}", "1")
increased_conceptual_precision = generate_request(f"{query} {expansion}", "30%")
increased_precision_same_recall = generate_request(f"{query} AND ({expansion})", "2")
slightly_increased_recall = generate_request(f"{query} {expansion}", "2")
# Here the query text stays untouched; the expansion is sent as `query_boosts`.
same_results_better_ranking = generate_request(query, "2", expansion)

print(f"Simple Query Expansion:\n{simple_expansion}")
print("\nIncreased Precision, Reduced Recall Query:")
print(increased_conceptual_precision)
print("\nIncreased Precision, No Reduction in Recall:")
print(increased_precision_same_recall)
print("\nSlightly Increased Recall Query:")
print(slightly_increased_recall)
print("\nSame Results, Better Conceptual Ranking:")
print(same_results_better_ranking)

Simple Query Expansion:
{'query': "vibranium vibranium^0.94237 wakandan^0.8197 adamantium^0.80724 wakanda^0.79122 alloy^0.75724 maclain^0.75623 klaw^0.75222 america's^0.74002 ", 'query_fields': ['title', 'body'], 'min_match': '1'}

Increased Precision, Reduced Recall Query:
{'query': "vibranium vibranium^0.94237 wakandan^0.8197 adamantium^0.80724 wakanda^0.79122 alloy^0.75724 maclain^0.75623 klaw^0.75222 america's^0.74002 ", 'query_fields': ['title', 'body'], 'min_match': '30%'}

Increased Precision, No Reduction in Recall:
{'query': "vibranium AND (vibranium^0.94237 wakandan^0.8197 adamantium^0.80724 wakanda^0.79122 alloy^0.75724 maclain^0.75623 klaw^0.75222 america's^0.74002 )", 'query_fields': ['title', 'body'], 'min_match': '2'}

Slightly Increased Recall Query:
{'query': "vibranium vibranium^0.94237 wakandan^0.8197 adamantium^0.80724 wakanda^0.79122 alloy^0.75724 maclain^0.75623 klaw^0.75222 america's^0.74002 ", 'query_fields': ['title', 'body'], 'min_match': '2'}

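Same Results, Better Conceptual Ranking:
{'query': 'vibranium', 'query_fields': ['title', 'body'], 'min_match': '2', 'query_boosts': "vibranium^0.94237 wakandan^0.8197 adamantium^0.80724 wakanda^0.79122 alloy^0.75724 maclain^0.75623 klaw^0.75222 america's^0.74002 "}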

## Content-based Recommendations

## Listing 5.8

In [7]:
def extract_phrases(document):
    """Stubbed entity extraction: `document` is ignored and a fixed list of
    phrases is returned."""
    stubbed_phrases = (
        "this", "doc", "contains", "the", "words", "luke",
        "magneto", "cyclops", "darth vader", "princess leia",
        "wolverine", "apple", "banana", "galaxy", "force",
        "blaster", "and", "chloe",
    )
    return list(stubbed_phrases)

In [15]:
def print_graph(traversal):
    """Print every term scored against "star wars" followed by its relatedness."""
    classification_node = traversal["graph"][0]["values"]["star wars"]
    scored_terms = classification_node["traversals"][0]["values"]
    for term, data in scored_terms.items():
        print(f'{term}  {data["relatedness"]}')

stackexchange_collection = engine.get_collection("stackexchange")
classification = "star wars"
document = """this doc contains the words luke, magneto, cyclops,
              darth vader, princess leia, wolverine, apple, banana,
              galaxy, force, blaster, and chloe."""
# `extract_phrases` is a stub, so the phrases do not depend on `document`.
parsed_document = extract_phrases(document)
# Both levels pass explicit values: every phrase of the document is scored
# against the documents matching the classification.
nodes_to_traverse = [{"values": [classification], "field": "body"},
                     {"values": parsed_document, "field": "body"}]

# Calls the notebook-local `traverse` from Listing 5.4 (not `skg.traverse`).
traversal = traverse(stackexchange_collection, *nodes_to_traverse)

print_graph(traversal)

luke  0.75212
force  0.73248
darth vader  0.69378
galaxy  0.58693
princess leia  0.50491
blaster  0.47143
this  0.19193
the  0.17519
words  0.10144
and  0.09709
contains  0.03434
doc  0.00885
chloe  0.0
cyclops  -0.01825
magneto  -0.02175
banana  -0.0319
wolverine  -0.03362
apple  -0.03894


## Listing 5.9

In [9]:
def get_scored_terms(traversal):
    """Map every term scored against "star wars" to its relatedness."""
    classification_node = traversal["graph"][0]["values"]["star wars"]
    scored_values = classification_node["traversals"][0]["values"]
    scores = {}
    for term, data in scored_values.items():
        scores[term] = data["relatedness"]
    return scores

# Keep only the terms whose relatedness exceeds 0.25 and turn each into a
# quoted, boosted phrase: "<term>"^<score>.
scored_terms = get_scored_terms(traversal)
boosted_phrases = [f'"{term}"^{score}'
                   for term, score in scored_terms.items()
                   if score > 0.25]
rec_query = " ".join(boosted_phrases)

print(f"Expanded Query:\n{rec_query}")

Expanded Query:
"luke"^0.75212 "force"^0.73248 "darth vader"^0.69378 "galaxy"^0.58693 "princess leia"^0.50491 "blaster"^0.47143


## Listing 5.10

In [10]:
# Search with the boosted recommendation query built in the previous listing,
# returning only the titles of the top 5 matches.
request = {"query": rec_query,
           "query_fields": ["title", "body"],
           "return_fields": ["title"],
           "limit": 5,
           # NOTE(review): presumably restricts results to documents that
           # have a title; confirm against the collection's `search`.
           "filters": [("title", "*")]}

response = stackexchange_collection.search(**request)

print(json.dumps(response["docs"], indent="  "))

[
  {
    "title": "At the end of Return of the Jedi, did Darth Vader learn that Princess Leia was his daughter?"
  },
  {
    "title": "Did Luke know the &quot;Chosen One&quot; prophecy?"
  },
  {
    "title": "Was Darth Vader at his strongest during Episode III?"
  },
  {
    "title": "Why couldn't Snoke or Kylo Ren trace Luke using the Force?"
  },
  {
    "title": "Does Kylo Ren know that Darth Vader reconciled with Luke?"
  }
]


## Exploring Arbitrary Relationships

## Listing 5.11

In [11]:
def print_graph(traversal):
    """Print every term related to the phrase "jean grey" with its relatedness.

    The lookup key includes the double quotes of the phrase query.
    """
    phrase_node = traversal["graph"][0]["values"]['"jean grey"']
    related_terms = phrase_node["traversals"][0]["values"]
    for term, data in related_terms.items():
        print(f'{term}  {data["relatedness"]}')
        
scifi_collection = engine.get_collection("scifi")
# The double quotes are part of the value, so the phrase is queried as a whole.
starting_node = '"jean grey"'
# Discover up to 9 `body` terms that occur at least 25 times.
nodes_to_traverse = [{"field": "body", "values": [starting_node]},
                     {"field": "body", "min_occurrences": 25, "limit": 9}]

traversal = skg.traverse(scifi_collection, *nodes_to_traverse)

print_graph(traversal)

jean  0.85044
grey  0.74965
cyclops  0.61313
summers  0.60624
xavier  0.54697
wolverine  0.49361
x  0.46596
mutant  0.46248
magneto  0.43692


## Query Classification

### See Listing 6.1

## Disambiguation

### See Listing 6.2-6.3

## Bonus Examples (not included in chapter)

In [12]:
def print_graph(traversal):
    """Print every term related to "spark" followed by its relatedness."""
    values_by_query = traversal["graph"][0]["values"]
    nested_traversal = values_by_query["spark"]["traversals"][0]
    for term, data in nested_traversal["values"].items():
        print(f'{term}  {data["relatedness"]}')
        
jobs_collection = engine.get_collection("jobs")
# The second node sets neither `limit` nor `min_occurrences`, so the facet
# defaults apply (the notebook-local `generate_facets` uses a limit of 10).
nodes_to_traverse = [{"field": "job_description", "values": ["spark"]},
                     {"field": "job_description"}]

traversal = skg.traverse(jobs_collection, *nodes_to_traverse)

print_graph(traversal)

spark  0.80665
hadoop  0.59424
hive  0.52983
kafka  0.51552
impala  0.45309
streamsets  0.39341
scala  0.38564
flume  0.38401
attunity  0.37374
mapreduce  0.36195


In [13]:
def print_graph(traversal):
    """Print every term related to "chef" followed by its relatedness."""
    root_traversal = traversal["graph"][0]
    chef_node = root_traversal["values"]["chef"]
    for term, data in chef_node["traversals"][0]["values"].items():
        print(f'{term}  {data["relatedness"]}')
        
jobs_collection = engine.get_collection("jobs")
# `min_popularity` is forwarded to the relatedness function of the second
# node (see `generate_facets` in Listing 5.4).
# NOTE(review): presumably this filters out very rare terms; confirm against
# the search engine's relatedness documentation.
nodes_to_traverse = [{"field": "job_description", "values": ["chef"]},
                     {"field": "job_description", "min_popularity": 0.0005}]

traversal = skg.traverse(jobs_collection, *nodes_to_traverse)

print_graph(traversal)

chef  0.80689
puppet  0.59501
ansible  0.52824
terraform  0.3866
jenkins  0.30455
culinary  0.25935
docker  0.25145
cd  0.2434
ci  0.23938
ruby  0.20856


## Success!

You've leveraged a semantic knowledge graph to find related terms for a query, performed query expansion based upon semantically-similar terms, explored multiple different ways to impact precision and recall of queries through integrating semantically-augmented queries, generated content-based recommendations leveraging a semantic knowledge graph, and explored arbitrary relationship types by traversing a semantic knowledge graph.

Semantic knowledge graphs can be a powerful tool for understanding user intent and interpreting both queries and content based upon meaning instead of just text keywords.

Up next: Chapter 6 - [Using Context to Learn Domain-specific Language ](../ch06/1.skg-classification-disambiguation.ipynb)