<a href="https://colab.research.google.com/github/vizzies/Building-BERT-Model/blob/master/Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [None]:
# GPU Setup

import torch

# Choose the compute device once; `device` is the name later code can refer to.
if not torch.cuda.is_available():
    # No CUDA device visible: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is present: point PyTorch at the GPU, then report how many
    # devices exist and the name of device 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


# Import Data Below and Parse

In [None]:
import pandas

import unicodedata

# Load the publications table from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
pubs = pandas.read_pickle('/content/arc-code-ti-publications.pkl')

import re
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag deleted."""
    return TAG_RE.sub('', text)

def preprocess_text(sen):
    """Normalise a raw text into space-separated ASCII letter-only words.

    Steps, in order: strip markup tags, re-join words hyphenated across a
    line break, apply NFKD normalisation (expands ligatures such as "fi"),
    drop combining accents, replace every non-letter with a space, remove
    isolated single letters, and collapse whitespace.

    Args:
        sen: The raw text. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string with no leading or trailing whitespace, or ``''``
        for a missing input.
    """
    # Missing values would otherwise be turned into the literal words
    # "None" / "nan" and end up in the corpus.
    if sen is None or (isinstance(sen, float) and sen != sen):
        return ''

    sentence = str(sen)

    # Removing html tags
    sentence = remove_tags(sentence)

    # Remove hyphenation if at the end of a line (Unix or Windows line ending)
    sentence = re.sub(r'-\r?\n', '', sentence)

    # Fix ligatures; NFKD also splits accented letters into base + combining mark
    sentence = unicodedata.normalize("NFKD", sentence)

    # Drop the combining marks, otherwise the next step replaces them with a
    # space and breaks words apart ("naïve" -> "nai ve").
    sentence = ''.join(ch for ch in sentence if not unicodedata.combining(ch))

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal. Lookarounds leave the surrounding whitespace
    # unconsumed, so every letter in a run such as " a b c " is removed
    # (a consuming pattern skips every other one).
    sentence = re.sub(r"(?<=\s)[a-zA-Z](?=\s)", '', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence.strip()

# Not really needed any more but will leave in and just comment out
# full_texts = []
# sentences = list(pubs['Text'])
# for sen in sentences:
#     full_texts.append(preprocess_text(str(sen)))

# Remove publications whose PDF extraction failed. A boolean mask removes
# exactly the matching rows even if index labels repeat, and comparing on the
# stripped text also catches a sentinel carrying stray whitespace/newlines.
is_pdf_error = pubs['Text'].astype(str).str.strip() == 'PDF error occurred'

# drop_duplicates returns a new frame, so its result must be kept; the final
# .copy() lets the column assignments below write to an independent frame.
pubs = pubs[~is_pdf_error].drop_duplicates(subset=['Text']).copy()

# Cleaned text used for embedding, and its length in whitespace-separated words.
pubs['Text Processed'] = pubs['Text'].apply(preprocess_text)
pubs['Word Count'] = pubs['Text Processed'].str.split().str.len()

# Single-column frame holding the corpus; its rows stay in the same order as pubs.
text_df = pubs[['Text Processed']].copy()

print(text_df)


                                          Text Processed
Index                                                   
0      Adaptive Stress Testing of Trajectory Predicti...
1      Capturing Analyzing Requirements with FRET Dim...
2                                     PDF error occurred
3      The Ten Lockheed Martin Cyber Physical Challen...
4      Generation of Formal Requirements from Structu...
...                                                  ...
669    A Flexible Evolvable Architecture for Constell...
670    Extended Abstract General Purpose Data Driven ...
671     PARAMETRIC ANALYSIS OF HOVER TEST VEHICLE USI...
672     Bringing Web to Government Research Case Stud...
673    Online Detection and Modeling of Safety Bounda...

[675 rows x 1 columns]


In [None]:
!pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.6/dist-packages (0.3.6)


## BERT Sentence Transformers Semantic Search

In [None]:
"""
This is a simple application for sentence embeddings: semantic search
We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.
This script outputs for various queries the top `closest_n` most similar sentences in the corpus.
"""
from sentence_transformers import SentenceTransformer
import scipy.spatial
import pickle as pkl
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

sentences = list(text_df['Text Processed'])

# Hits are mapped back to `pubs` by row position, so the corpus and `pubs`
# must have the same number of rows in the same order.
assert len(sentences) == len(pubs), "text_df and pubs are out of sync; re-run the preprocessing cell"

# Example query sentences
queries = ['How to evolve architecture for constellations and simulation', 'Build behavior of complex aerospace and modeling of safety']
query_embeddings = embedder.encode(queries,show_progress_bar=True)
# NOTE(review): each corpus entry is a whole document; the model presumably
# truncates inputs longer than its maximum sequence length - confirm.
text_embeddings = embedder.encode(sentences, show_progress_bar=True)

# Find the closest_n entries of the corpus for each query sentence based on cosine similarity
closest_n = 5
print("\nTop %d most similar sentences in corpus:" % closest_n)
for query, query_embedding in zip(queries, query_embeddings):
    # Cosine *distance* between the query and every corpus entry; smaller
    # means more similar, and (1 - distance) is the cosine similarity.
    distances = scipy.spatial.distance.cdist([query_embedding], text_embeddings, "cosine")[0]

    # (corpus position, distance) pairs ordered from most to least similar.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n=========================================================")
    print("==========================Query==============================")
    print("===",query,"=====")
    print("=========================================================")


    for idx, distance in results[0:closest_n]:
        # Similarity expressed as a percentage.
        print("Score:   ", "%.2f%%" % ((1 - distance) * 100.0), "\n")
        # Positional lookup: idx is the row position shared by text_df and pubs.
        row = pubs.iloc[idx]
        print("Title:  " , row["Title"]  , "\n")
        print("Abstract:  " , row["Abstract"] , "\n")
        print("-------------------------------------------")

HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=22.0, style=ProgressStyle(description_width…



Top 5 most similar sentences in corpus:


=== How to evolve architecture for constellations and simulation =====
Score:    (Score: 0.6742) 

Title:   Fault Diagnostics and Prognostics for Large Segmented SRM's  

Abstract:   

 
Abstract—Prognostics  has  taken  center  stage  in  Condition 
Based  Maintenance  (CBM)  where  it  is  desired  to  estimate 
Remaining  Useful  Life  (RUL)  of  a  system  so  that  remedial 
measures  may  be  taken  in  advance  to  avoid  catastrophic 
events  or  unwanted  downtimes.  Validation  of  such 
predictions  is  an  important  but  difficult  proposition  and  a 
lack  of  appropriate  evaluation  methods  renders  prognostics 
meaningless.  Evaluation  methods  currently  used  in  the 
research community are not standardized and in many cases 
do not sufficiently assess key performance aspects expected 
out  of  a  prognostics  algorithm.  In  this  paper  we  introduce 
several  new  evaluation  metrics  tailored  for  prognostics  and 
