## Install packages

!pip3 install torch torchvision
!pip3 install sentence-transformers
!pip3 install lxml
!pip3 install bs4

## Imports

In [66]:
import os

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from bs4 import BeautifulSoup
import pickle

from sentence_transformers import SentenceTransformer, util
import torch

## Basic Configs

In [15]:
# Base URL of the OpenStax archive "contents" endpoint. A book or page id plus
# '.json' is appended to it to build each request URL.
# NOTE(review): the dated path segment looks like a pinned archive version - confirm it is still served.
OPENSTAX_API = 'https://openstax.org/apps/archive/20220118.185250/contents/'
# Identifier of the book that gets indexed (the Biology 2e textbook for this demo).
BIOLOGY_BOOK_ID = '8d50a0af-948b-4204-a71d-4826cba765b8@879b754'
# Local folder into which every downloaded page is written as an .html file.
DOWNLOAD_FOLDER = 'data/openstax_biology'

## Prepare Corpus
1. Download OpenStax content (Creative Commons license)
    - For this demo, I am indexing the Biology 2e textbook from OpenStax: https://openstax.org/details/books/biology-2e
2. Pre-processing functions and cleanups
3. Prep metadata

In [3]:
def get_retry_strategy():
    """Returns the retry strategy.

    The strategy allows up to 3 retries with exponential backoff when the
    server answers with 429, 500, 502, 503 or 504.

    Returns:
        A ``Retry`` object suitable for ``HTTPAdapter(max_retries=...)``.
    """
    retry_options = dict(
        total=3,
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=1,  # Exponential retry
    )
    # Status-based retries only apply to the listed HTTP methods. This notebook
    # issues GET requests, so GET must be listed; POST is kept so that existing
    # behaviour for POST callers does not change.
    retried_methods = ["GET", "POST"]
    try:
        # Current urllib3 releases call this argument `allowed_methods`.
        return Retry(allowed_methods=retried_methods, **retry_options)
    except TypeError:
        # Older urllib3 releases only know the original `method_whitelist` name.
        return Retry(method_whitelist=retried_methods, **retry_options)

def get_service():
    """Build the HTTP session used to call the Openstax APIs.

    Both the https:// and http:// prefixes are routed through one adapter that
    applies the retry strategy from get_retry_strategy().
    """
    session = requests.Session()
    retrying_adapter = HTTPAdapter(max_retries=get_retry_strategy())
    for url_prefix in ("https://", "http://"):
        session.mount(url_prefix, retrying_adapter)
    return session

def get_headers():
    """Get the headers.

    Returns:
        dict: the HTTP headers to send with each request.
    """
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    # The original bare `return` discarded the dict and handed callers None.
    return headers

service = get_service()
# Fetch the JSON description of the whole book; its 'tree' entry is walked
# further down to collect the pages to download.
response = service.get(OPENSTAX_API + BIOLOGY_BOOK_ID + '.json', headers = get_headers())
# Stop here on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
result = response.json()

  retry_strategy = Retry(


In [131]:
# Slug fragments of pages that are left out of the corpus because they do not
# carry the main content
SKIP_SECTIONS = ['introduction', 'key-terms', 'chapter-summary', 'visual-connection-questions', 'review-questions', 'critical-thinking-questions']

# Walk the three levels of the book tree and collect "<book id>:<page id>" keys
content_sections = []
for chapter in result['tree']['contents']:
    # The preface is never indexed, and entries without children have nothing to walk
    if chapter['slug'] == 'preface' or 'contents' not in chapter:
        continue
    for section in chapter['contents']:
        if 'slug' not in section:
            continue
        for subsection in section['contents']:
            subsection_slug = subsection['slug']
            if any(skipped in subsection_slug for skipped in SKIP_SECTIONS):
                continue
            # The final character of the id is dropped before it is used as a key
            # (presumably a trailing separator - verify against the API response)
            content_sections.append(BIOLOGY_BOOK_ID + ':' + subsection['id'][:-1])

In [43]:
# Maps a page slug to its human-readable title; filled in by download_content().
slug2title = {}

def download_content(subsection_id):
    """Download one page, save its HTML locally and record its title.

    Args:
        subsection_id: page key appended to OPENSTAX_API (followed by '.json')
            to build the request URL.

    Returns:
        A (slug, title, content) tuple taken from the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    service = get_service()
    response = service.get(OPENSTAX_API + subsection_id + '.json', headers = get_headers())
    # Surface HTTP errors here rather than failing later on a missing JSON key.
    response.raise_for_status()
    result = response.json()
    # The download folder is not guaranteed to exist on a fresh checkout.
    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
    html_path = os.path.join(DOWNLOAD_FOLDER, result['slug'] + '.html')
    # Write UTF-8 explicitly so the output does not depend on the platform's
    # default encoding.
    with open(html_path, 'w', encoding='utf-8') as fd:
        fd.write(result['content'])
    slug2title[result['slug']] = result['title']
    return result['slug'], result['title'], result['content']

def xhtml_to_text(html):
    """Return the text of the document body with script and style elements removed.

    The markup is parsed with the lxml parser. None is returned when the parsed
    document has no body; otherwise the body's text is joined with no separator.
    """
    body = BeautifulSoup(html, 'lxml').body
    if body is None:
        return None
    # Remove non-content elements before the text is extracted
    for tag_name in ('script', 'style'):
        for node in body.select(tag_name):
            node.decompose()
    return body.get_text(separator='')

def get_html_content_snippets(html_content):
    """Split a page's HTML into text snippets, one per block-level element.

    Args:
        html_content: HTML markup of one page.

    Returns:
        list[str]: the text of every matched block element, in the order
        BeautifulSoup returns them.
    """
    snippets = []
    # Name the parser explicitly (the same one xhtml_to_text uses) so the result
    # does not depend on which parsers happen to be installed, and so
    # BeautifulSoup does not warn about guessing one.
    soup = BeautifulSoup(html_content, 'lxml')
    block_elements = ['ul', 'ol', 'dl', 'table', 'div', 'pre', 'blockquote', 'p']
    # NOTE(review): nested block elements are matched individually as well, so
    # text inside e.g. a <div> that wraps <p> tags appears in several snippets.
    for each_block in soup.find_all(block_elements):
        # The wrapper <div data-type="page"> is skipped.
        if each_block.name == 'div' and each_block.get('data-type') == "page":
            continue
        snippets.append(each_block.text)
    return snippets

## Use the Sentence-BERT (MiniLM) model to compute sentence embeddings

In [65]:
# Load the sentence-embedding model by name; it is used below to encode both
# the corpus snippets and the query sentences.
model = SentenceTransformer('all-MiniLM-L12-v2')

# Override the model's maximum sequence length.
# NOTE(review): presumably measured in tokens, with longer inputs truncated - confirm.
model.max_seq_length = 512

In [71]:
# Download every selected page, split it into snippets and remember which page
# (slug) each snippet position belongs to
content = []
idx2slug = {}
no_snippets = 0
for each_section in content_sections:
    slug, title, html_content = download_content(each_section)
    snippets = get_html_content_snippets(html_content)
    # Positions are global across pages, so offset them by the running total
    idx2slug.update({no_snippets + offset: slug for offset in range(len(snippets))})
    content += snippets
    no_snippets += len(snippets)
# Encode all snippets in one call; row i of the result corresponds to content[i]
content_embeddings = model.encode(content, convert_to_tensor=True)

  retry_strategy = Retry(


In [61]:
# Save the embeddings together with the two lookup tables needed to turn a
# snippet position back into a page title
with open('content_embeddings.pkl', "wb") as fd:
    pickle.dump(
        dict(
            idx2slug=idx2slug,
            content_embeddings=content_embeddings,
            slug2title=slug2title,
        ),
        fd,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

## Demo - Entity Matching using Wikipedia Content

In [117]:
def get_unique_entitites(sentence, top_k = 5, candidate_pool = 100):
    """Print the best-matching distinct page titles for a piece of text.

    The sentence is embedded with the notebook's model and compared by cosine
    similarity against every snippet embedding. The highest-scoring snippets
    are then walked in order, keeping the first occurrence of each page title
    until `top_k` titles have been collected.

    Args:
        sentence: text to match against the indexed content.
        top_k: maximum number of distinct titles to print.
        candidate_pool: number of highest-scoring snippets to inspect while
            looking for distinct titles (previously hard-coded to 100).

    Returns:
        None. The matches are only printed, one line per title.
    """
    sentence_embedding = model.encode(sentence, convert_to_tensor=True)
    cos_scores = util.cos_sim(sentence_embedding, content_embeddings)[0]
    # torch.topk raises when k exceeds the number of scores, so never ask for
    # more candidates than there are snippets.
    pool_size = min(candidate_pool, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=pool_size)

    entities = []
    top_scores = []
    for score, idx in zip(top_results[0], top_results[1]):
        if len(entities) >= top_k:
            break
        entity = slug2title[idx2slug[int(idx)]]
        # Several snippets can come from the same page; keep only the first
        # (highest-scoring) hit per title.
        if entity not in entities:
            entities.append(entity)
            top_scores.append("{:.4f}".format(score))
    for entity, score in zip(entities, top_scores):
        print('Entity: [%s], Score: [%s]' %(entity, score))

#### Entity matching where the content is an entire paragraph
- Notice that in the sentence there is no reference to the word Homeostasis, but we were able to match the sentence to the correct entity **Homeostasis**

In [122]:
# https://en.wikipedia.org/wiki/Homeostasis
sentence = '''
Some centers, such as the renin–angiotensin system, control more than one variable. When the receptor senses a stimulus, it reacts by sending action potentials to a control center. The control center sets the maintenance range—the acceptable upper and lower limits—for the particular variable, such as temperature. The control center responds to the signal by determining an appropriate response and sending signals to an effector, which can be one or more muscles, an organ, or a gland. When the signal is received and acted on, negative feedback is provided to the receptor that stops the need for further signaling.[5]
'''
get_unique_entitites(sentence)

Entity: [Homeostasis], Score: [0.6158]
Entity: [Regulation of Hormone Production], Score: [0.5783]
Entity: [Digestive System Regulation], Score: [0.5697]
Entity: [Hormonal Control of Osmoregulatory Functions], Score: [0.5549]
Entity: [Endocrine Glands], Score: [0.5425]


#### One more example of entity matching where the content is an entire paragraph
- Notice that in the sentence there is no reference to the word Bryophytes or plants, but we were able to match the sentence to the correct entity

In [121]:
# https://en.wikipedia.org/wiki/Bryophyte
sentence = '''
The sporophyte develops differently in the three groups. Both mosses and hornworts have a meristem zone where cell division occur. In hornworts, the meristem starts at the base where the foot ends, and the division of cells is pushing the sporophyte body upwards. In mosses, the meristem is located between the capsule and the top of the stalk (seta), and produce cells downward, elongating the stalk and elevates the capsule. In liverworts the meristem is absent and the elongation of the sporophyte is caused almost exclusively by cell expansion.[14]
'''
get_unique_entitites(sentence)

Entity: [Bryophytes], Score: [0.7237]
Entity: [Early Plant Life], Score: [0.6977]
Entity: [Evolution of Seed Plants], Score: [0.6817]
Entity: [The Plant Body], Score: [0.6674]
Entity: [Seedless Vascular Plants], Score: [0.6604]


#### Entity matching where the content is just a few words (very little context)
 - The results are accurate for shorter sentences too

In [123]:
sentence = '''
what are group II viruses
'''
get_unique_entitites(sentence)

Entity: [Viral Evolution, Morphology, and Classification], Score: [0.6856]
Entity: [Virus Infections and Hosts], Score: [0.4559]
Entity: [Prevention and Treatment of Viral Infections], Score: [0.4328]
Entity: [Innate Immune Response], Score: [0.4266]
Entity: [Signaling Molecules and Cellular Receptors], Score: [0.4196]


#### One more example of entity matching where the content is just a few words (very little context)
 - The results are accurate for shorter sentences too

In [126]:
sentence = '''
examples of jawed fishes
'''
get_unique_entitites(sentence)

Entity: [Fishes], Score: [0.7810]
Entity: [Chordates], Score: [0.5713]
Entity: [Amphibians], Score: [0.5438]
Entity: [Mammals], Score: [0.4904]
Entity: [Osmoregulation and Osmotic Balance], Score: [0.4704]


#### Entity matching where the similarity score is quite low (0.3885) because the paragraph is from Physics (and not from Biology)
- This is the case where we will need to create a new node in our Knowledge Graph
- The model is doing the right thing by predicting a low value of similarity. 

In [129]:
# https://en.wikipedia.org/wiki/Newton%27s_laws_of_motion

sentence = '''
Newton's first law describes objects that are in two different situations: objects that are stationary, and objects that are moving straight at a constant speed. Newton observed that objects in both situations will only change their speed if a net force is applied to them. An object which is undergoing a net force of zero is said to be at mechanical equilibrium, and Newton's first law suggests two different types of mechanical equilibrium: an object which has net forces of zero and which is not moving is at mechanical equilibrium, but an object that is moving in a straight line and with constant velocity is also at mechanical equilibrium.[4]: 140
'''
get_unique_entitites(sentence)


Entity: [The Laws of Thermodynamics], Score: [0.3885]
Entity: [Animal Form and Function], Score: [0.3536]
Entity: [Somatosensation], Score: [0.3259]
Entity: [Muscle Contraction and Locomotion], Score: [0.3240]
Entity: [Passive Transport], Score: [0.3184]
