<a href="https://colab.research.google.com/github/vizzies/NASA-Semantic-Search-Engine-for-Scientific-Literature/blob/SJ/Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# from google.colab import drive
# drive.mount(‘/content/gdrive’)

In [1]:
# GPU Setup

import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [8]:
#Creating PyDrive instance to load in data from PeTaL shared drive, follow the steps to authenticate
!pip install -U -q PyDrive 
  
from pydrive.auth import GoogleAuth 
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials 
import pandas as pd
   
# Authenticate and create the PyDrive client. 
auth.authenticate_user() 
gauth = GoogleAuth() 
gauth.credentials = GoogleCredentials.get_application_default() 
drive = GoogleDrive(gauth)

In [11]:
data_link = 'https://drive.google.com/file/d/1bUSp7OZG0qataSH5Xf3cr7GCa9GKhOPb/view?usp=sharing'
id = data_link.split("/")[-2]
downloaded = drive.CreateFile({'id':id})  
downloaded.GetContentFile('arc-code-ti-publications.pkl') 
data = pd.read_pickle('arc-code-ti-publications.pkl')

# Import Data Below and Parse

In [13]:
import pandas as pd

import unicodedata

#/content/arc-code-ti-publications.pkl
# Import this into the Colab via the Files section
#with open(data, 'rb') as f:
#    pubs = pd.read_pickle(f)

pubs = data
import re
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)
    
def preprocess_text(sen):

    sentence = str(sen)

    # Removing html tags
    sentence = remove_tags(sentence)

    # Remove hyphenation if at the end of a line
    sentence = sentence.replace('-\n', '')

    # Fix ligatures
    sentence = unicodedata.normalize("NFKD", sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

# Not really needed any more but will leave in and just comment out
# full_texts = []
# sentences = list(pubs['Text'])
# for sen in sentences:
#     full_texts.append(preprocess_text(str(sen)))

pubs.drop(pubs[pubs['Text'] == 'PDF error occurred'].index, inplace = True) 

pubs.drop_duplicates(subset=['Text'])

pubs['Text Processed'] = pubs.apply(lambda row: preprocess_text(row['Text']), axis=1)

pubs['Word Count'] = pubs.apply(lambda row: len(row['Text Processed'].split()), axis=1)

text_df = pubs[['Text Processed',]].copy()

print(text_df)


                                          Text Processed
Index                                                   
0      Adaptive Stress Testing of Trajectory Predicti...
1      Capturing Analyzing Requirements with FRET Dim...
3      The Ten Lockheed Martin Cyber Physical Challen...
4      Generation of Formal Requirements from Structu...
5      Formal Requirements Elicitation with FRET Dimi...
...                                                  ...
669    A Flexible Evolvable Architecture for Constell...
670    Extended Abstract General Purpose Data Driven ...
671     PARAMETRIC ANALYSIS OF HOVER TEST VEHICLE USI...
672     Bringing Web to Government Research Case Stud...
673    Online Detection and Modeling of Safety Bounda...

[666 rows x 1 columns]


In [14]:
!pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f4/fd/0190080aa0af78d7cd5874e4e8e85f0bed9967dd387cf05d760832b95da9/sentence-transformers-0.3.8.tar.gz (66kB)
[K     |████████████████████████████████| 71kB 5.5MB/s 
[?25hCollecting transformers<3.4.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 15.1MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 53.0MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-c

## BERT Sentence Tranformers Semantic Search

In [15]:
"""
This is a simple application for sentence embeddings: semantic search
given query sentence,this finds the most similar sentence in this corpus
script outputs for various queries the top 5 most similar publications in the corpus
*Used open source code to aid in development
"""
from sentence_transformers import SentenceTransformer
import scipy.spatial
import pickle as pkl
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

sentences = list(text_df['Text Processed'])

# Eaxmple query sentences
queries = ['How to evolve architecture for constellations and simulation', 'Build behavior of complex aerospace and modeling of safety']
query_embeddings = embedder.encode(queries,show_progress_bar=True)
text_embeddings = embedder.encode(sentences, show_progress_bar=True)
#
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
closest_n = 5
print("\nTop 5 most similar sentences in corpus:")
for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist([query_embedding], text_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("------------------------User Query: ------------------------")
    print("--",query,"--")
    print("------------------------------------------------------------")


# Print out all information for the publications related to user query and a relevancy score
    for idx, distance in results[0:closest_n]:
        print("Relevancy Score:   ", "(Score: %.0f%%)" % ((1-distance) * 100.0) , "\n" )
        row_dict = pubs.iloc[idx].to_dict() # pubs.loc[pubs.index== sentences[idx]].to_dict()
        #print(row_dict)
        print("Title:  " , row_dict["Title"]  , "\n")
        print("Authors:  " , row_dict["Authors"] , "\n")
        print("Date:  " , row_dict["Date"] , "\n")
        print("Link:  " , row_dict["Link"] , "\n")
        print("Abstract Length:  " , row_dict["Abstract Length"] , "\n")
        print("Abstract:  " , row_dict["Abstract"] , "\n")
        print("-------------------------------------------")

100%|██████████| 405M/405M [00:15<00:00, 26.2MB/s]


HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=21.0, style=ProgressStyle(description_width…



Top 5 most similar sentences in corpus:
------------------------User Query: ------------------------
-- How to evolve architecture for constellations and simulation --
------------------------------------------------------------
Relevancy Score:    (Score: 68%) 

Title:   Fault Diagnostics and Prognostics for Large Segmented SRM's  

Authors:   Dimitry Luchinsky,Vadim Smelyanskiy,Viatcheslav Osipov,Dogan Timucin 

Date:   03/07/09 

Link:   http://ti.arc.nasa.gov/publications/245/download/ 

Abstract Length:   1544 

Abstract:   

 
Abstract—Prognostics  has  taken  center  stage  in  Condition 
Based  Maintenance  (CBM)  where  it  is  desired  to  estimate 
Remaining  Useful  Life  (RUL)  of  a  system  so  that  remedial 
measures  may  be  taken  in  advance  to  avoid  catastrophic 
events  or  unwanted  downtimes.  Validation  of  such 
predictions  is  an  important  but  difficult  proposition  and  a 
lack  of  appropriate  evaluation  methods  renders  prognostics 
meaningl