# Phase I - Data Preparation and Modelling
<hr>
This is a <b>one-time process</b> that generates vector embeddings for the document corpus using an ensemble approach and stores the model objects/configuration state for later searching. There are 3 stages in this phase - <br>
1. Data Preparation and Wrangling <br>
2. Generation of embeddings for the document corpus for each technique in the ensemble approach<br>
3. Saving the model objects and configuration for later use<br>

## Setup

### Importing packages

In [None]:
import pandas as pd
import csv
import json
import time
import pickle
import torch
import numpy as np 
import sys
import os 
import nltk
import pandas as pd
import spacy
import scipy
import gensim
from pprint import pprint

from flask import Flask, render_template, jsonify, request

from gensim.models import KeyedVectors

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn import mixture

from scipy import spatial
from sentence_transformers import SentenceTransformer
from transformers import *
from summarizer import Summarizer

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

### Setup the nltk library

In [None]:
# Keep every NLTK resource in the project-local '../bin/' directory and make
# NLTK search that directory.
nltk.data.path.append('../bin/')

# Resources are fetched in the same order as before; the stop-word list is
# read once 'stopwords' and 'punkt' have been downloaded.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource, download_dir='../bin/', quiet=True)
stop_words = stopwords.words('english')

for resource in ('averaged_perceptron_tagger', 'wordnet', 'omw'):
    nltk.download(resource, download_dir='../bin/', quiet=True)

# English Punkt tokenizer loaded from the downloaded resources.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

<a id="data"></a>
## Data loading and Cleaning
<hr>

### Cleaning the abstracts

In [None]:
# Load the metadata CSV, keep one row per distinct abstract and discard rows
# that have no abstract at all.
raw_md_data = pd.read_csv('../data/metadata.csv')
raw_md_data = raw_md_data.drop_duplicates(subset=['abstract'])
raw_md_data = raw_md_data.dropna(subset=['abstract'])

# Keep only the columns needed downstream.
required_columns = ['cord_uid', 'title', 'abstract']
df = raw_md_data[required_columns]
clean_abstracts = df['abstract']

# Persist id, title and abstract (no header row, no index column).
df.to_csv('../data/cleaned_abstracts.csv', header=False, index=False)

### Convert data into JSON format for ingestion into Elasticsearch index

In [None]:
# Convert the headerless CSV into newline-delimited JSON (one document per
# line). The context managers guarantee both files are flushed and closed
# before anything else reads the JSON file.
fieldnames = ("cord_uid", "title", "abstract") #corresponding to the required columns

# newline='' lets the csv module itself handle newlines embedded in quoted fields.
with open('../data/cleaned_abstracts.csv', 'r', encoding='utf-8', newline='') as csvfile, \
        open('../data/cleaned_abstracts.json', 'w', encoding='utf-8') as jsonfile:
    reader = csv.DictReader(csvfile, fieldnames)
    for row in reader:
        json.dump(row, jsonfile)
        jsonfile.write('\n')

### Helper Functions 

In [None]:
#To clean the abstracts
def clean_docs(doc_list):
    """Normalise raw documents into lower-case, stop-word-free strings.

    For every document: each character other than an ASCII letter or ``#`` is
    replaced by a space, words of three characters or fewer are dropped, the
    remaining words are lower-cased and NLTK English stop words are removed.

    :param doc_list: sequence of document strings
    :return: numpy array holding one cleaned string per input document, in
        the same order as ``doc_list``
    """
    doc_df = pd.DataFrame({'document': doc_list})

    # Replace everything except letters (and '#') with a space. regex=True is
    # required: pandas >= 2.0 treats the pattern as a literal string by default,
    # which would leave the text uncleaned.
    letters_only = doc_df['document'].str.replace("[^a-zA-Z#]", " ", regex=True)

    # A set gives constant-time membership tests for the stop-word filter.
    stop_words = set(stopwords.words('english'))

    detokenized_doc = []
    for text in letters_only:
        # Drop short words first, then lower-case (lower-casing ASCII letters
        # does not change word length, so the order matches the original).
        kept_words = (word.lower() for word in text.split() if len(word) > 3)
        detokenized_doc.append(
            ' '.join(word for word in kept_words if word not in stop_words)
        )

    return np.array(detokenized_doc)

<a id="Modelling and embedding generation"></a>
## Modelling and embedding generation

<a id="Topic-Modeling"></a>
### Topic Modeling with Cosine distance
<hr>
Topic modeling is an unsupervised NLP technique for assigning particular words to clusters. These clusters can be thought of as word clouds and contain similar terms. Latent semantic analysis (LSA) and latent Dirichlet allocation (LDA) are two of the most popular topic modeling methods. Here we use LSA for its computational speed, but LDA could also be considered. As with the other methods presented here, we use Cosine distance with the results of the topic modeling.
<img src="https://i.ibb.co/23sp1Gb/tm-abstracts.png">

In [None]:
NUM_TOPICS= 20

def make_tm_output(doc_list,num_tf_idf_features=1000,num_compons=20):
    """
    Build LSA topic vectors for a document collection.

    The documents are cleaned with ``clean_docs``, converted to a TF-IDF
    matrix and reduced with truncated SVD.

    :param doc_list: sequence of raw document strings
    :param num_tf_idf_features: maximum number of terms kept by the vectorizer
    :param num_compons: number of SVD components (topics) per document
    :return: tuple ``(tm_output, vectorizer, svd_model)`` - the document-topic
        matrix, the fitted TfidfVectorizer and the fitted TruncatedSVD
    """
    detokenized_doc = clean_docs(doc_list)

    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=num_tf_idf_features,  # vocabulary size cap
        max_df=0.25,  # ignore terms found in more than 25% of the documents
        smooth_idf=True,
    )
    tfidf_output = vectorizer.fit_transform(detokenized_doc)

    # SVD represents documents and terms as vectors
    svd_model = TruncatedSVD(n_components=num_compons, algorithm='randomized', n_iter=100, random_state=42)

    # fit_transform() fits the model and projects the documents in one pass; a
    # separate fit() beforehand only repeated the same seeded factorisation.
    tm_output = svd_model.fit_transform(tfidf_output)
    return tm_output, vectorizer, svd_model

def tm_doc_embed(idx):
    """Return the topic-model vector of corpus document ``idx`` as a list.

    Reads row ``idx`` of the module-level ``tm_output`` matrix. A row whose
    components are all zero is shifted by 1e-6, because cosine similarity is
    undefined for a zero vector.

    :param idx: row index into ``tm_output``
    :return: list of floats
    """
    doc_vec = tm_output[idx,]
    # Test "every component is zero" directly: a plain sum can also be zero
    # when non-zero positive and negative components cancel out.
    if not np.any(doc_vec):
        doc_vec = doc_vec + 1e-6
    return doc_vec.tolist()

def tm_query_embed(query):
    """Embed a free-text query in the topic-model space.

    The query is cleaned with ``clean_docs`` and projected with the
    module-level ``vectorizer`` and ``svd_model``.

    :param query: query string
    :return: list of floats, one per SVD component
    """
    clean_query = clean_docs([query])
    tfidf_query_output = vectorizer.transform(clean_query)
    target_vec = svd_model.transform(tfidf_query_output)[0]
    # Same guard as tm_doc_embed: a query with no known terms projects to the
    # zero vector, for which cosine similarity is undefined.
    if not np.any(target_vec):
        target_vec = target_vec + 1e-6
    return target_vec.tolist()

In [None]:
# Build the topic-model vectors for every abstract and report the wall-clock time.
start_time = time.time()
tm_output, vectorizer, svd_model = make_tm_output(
    list(clean_abstracts),
    num_compons=NUM_TOPICS,
)
elapsed = time.time() - start_time
print('Time taken:', elapsed, 'seconds')

In [None]:
#Save models and vectors for the corpus for topic modeling
tm_artifacts = (
    ('../models/tm_vectors.pkl', tm_output),
    ('../models/tm_vectorizer.pkl', vectorizer),
    ('../models/tm_svd_model.pkl', svd_model),
)
# One pickle file per artifact, written in the order listed above.
for artifact_path, artifact in tm_artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

<a id="TF-IDF"></a>
### TF-IDF with Cosine distance
<hr>
Term Frequency-Inverse Document Frequency (TF-IDF) is a basic NLP method that determines the importance of an individual word relative to a document. That is, words are weighted based on how often they appear in a document and then inversely weighted based on how often they appear across a collection of documents. Cosine distance (https://en.wikipedia.org/wiki/Cosine_similarity) is a common distance measure in the NLP literature and is used with many of the methods presented here. Cosine distance measures the difference in orientation. Thus, it is possible that two sentences or documents are far apart in Euclidean space but actually have similar orientations and are similar according to Cosine distance.
<a name="some-id"></a>

In [None]:
def tfidf_bow(doc_list, num_topics=300):
    """Fit a TF-IDF weighted LSI model on a document collection.

    The documents are cleaned with ``clean_docs``, tokenized, converted to
    bag-of-words vectors, TF-IDF weighted and projected with gensim's LSI.

    :param doc_list: sequence of raw document strings
    :param num_topics: number of LSI topics requested (defaults to 300, the
        value previously hard-coded here)
    :return: tuple ``(dictionary, lsi, corpus_lsi)`` - the gensim Dictionary,
        the fitted LsiModel and the LSI-transformed corpus. The intermediate
        TfidfModel is not returned.
    """
    #clean the docs
    detokenized_doc = clean_docs(doc_list)
    gen_docs = [[w.lower() for w in word_tokenize(text)] for text in detokenized_doc]

    # create the dictionary
    dictionary = gensim.corpora.Dictionary(gen_docs)

    # Create bag of words
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tf_idf[corpus]
    lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    corpus_lsi = lsi[corpus_tfidf]
    return dictionary, lsi, corpus_lsi

def tfidf_doc_embed(idx):
    """Return the LSI vector of corpus document ``idx`` as a dense list.

    Reads the module-level ``corpus_lsi`` and ``lsi`` objects.

    :param idx: position of the document in ``corpus_lsi``
    :return: list of ``lsi.num_topics`` floats
    """
    # corpus_lsi yields sparse (topic_id, weight) pairs and may leave topics
    # out; expand to a fixed-length vector so every weight stays at its own
    # topic position and all documents have the same dimensionality.
    doc_vec = gensim.matutils.sparse2full(corpus_lsi[idx], lsi.num_topics)
    if not doc_vec.any(): #to avoid all-zero vectors (cosine similarity)
        return lsi.num_topics * [1e-6]
    return doc_vec.tolist()

def tfidf_query_embed(query):
    """Embed a free-text query in the LSI space as a dense list.

    Uses the module-level ``dictionary`` and ``lsi`` objects.

    :param query: query string
    :return: list of ``lsi.num_topics`` floats
    """
    detokenized_compare_doc = clean_docs([query])
    query_tokens = [w.lower() for w in word_tokenize(detokenized_compare_doc[0])]
    query_doc_bow = dictionary.doc2bow(query_tokens)
    # NOTE(review): tfidf_bow applies TF-IDF weighting before the LSI
    # projection, but the TfidfModel is not available here, so the raw
    # bag-of-words is projected - confirm whether this is intended.
    query_lsi = lsi[query_doc_bow]
    # The sparse result may omit topics (or be empty for an out-of-vocabulary
    # query); expand it so the vector always has lsi.num_topics entries.
    query_vec_tfidf = gensim.matutils.sparse2full(query_lsi, lsi.num_topics)
    if not query_vec_tfidf.any(): #to avoid all-zero vectors (cosine similarity)
        return lsi.num_topics * [1e-6]
    return query_vec_tfidf.tolist()

In [None]:
# Fit the TF-IDF/LSI model on every abstract and report the wall-clock time.
start_time = time.time()
abstract_list = list(clean_abstracts)
dictionary, lsi, corpus_lsi = tfidf_bow(abstract_list)
elapsed = time.time() - start_time
print('Time taken:', elapsed, 'seconds')

In [None]:
#Save models and vectors for the corpus for TF-IDF
tfidf_artifacts = (
    ('../models/tfidf_dict.pkl', dictionary),
    ('../models/tfidf_lsi.pkl', lsi),
    ('../models/tfidf_vectors.pkl', corpus_lsi),
)
# One pickle file per artifact, written in the order listed above.
for artifact_path, artifact in tfidf_artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

<a id="BERT-cos"></a>
### BERT with Cosine Distance 
<hr>
Bidirectional Encoder Representations from Transformers (BERT) is a pre-trained model developed by Google. Unlike traditional RNNs or LSTMs, which only learn in one direction, BERT is trained in both directions and thus is better at understanding context. Once again we use Cosine distance.

In [None]:
# One time task (already done and stored embeddings in 'models')
# Generates BERT embeddings for the CORD dataset so that they can be stored
def make_bert_embeddings(doc_list):
    """Encode ``doc_list`` with the SentenceTransformer stored in '../bin/models/'.

    :param doc_list: list of document strings
    :return: the value returned by ``SentenceTransformer.encode`` for the list
    """
    encoder = SentenceTransformer('../bin/models/')
    embeddings = encoder.encode(doc_list, show_progress_bar=True)
    return embeddings


bert_corpus = list(clean_abstracts)

start_time = time.time()
# make the embeddings (bert_corpus is already a list, no extra copy needed)
corpus_embed = make_bert_embeddings(bert_corpus)
print(time.time()-start_time)

# bert_doc_embed() reads the embeddings through the name `bert_embeddings`;
# bind it here so indexing works in the same session without first reloading
# the pickle written below.
bert_embeddings = corpus_embed

with open('../models/bert_corpus_embed.pkl', 'wb') as f:
    pickle.dump(corpus_embed, f)

In [None]:
def bert_doc_embed(idx):
    """Return entry ``idx`` of the module-level ``bert_embeddings`` as a list."""
    embedding = bert_embeddings[idx]
    return embedding.tolist()

def bert_query_embed(query):
    """Embed a free-text query with the SentenceTransformer in '../bin/models/'.

    The model is loaded on the first call and then reused, instead of being
    re-read from disk for every query.

    :param query: query string
    :return: the query embedding as a list
    """
    # Cache the loaded model on the function object so that no new import or
    # module-level name is needed.
    generic_bert_model = getattr(bert_query_embed, '_model', None)
    if generic_bert_model is None:
        generic_bert_model = SentenceTransformer('../bin/models/')
        bert_query_embed._model = generic_bert_model
    # A progress bar for a single sentence only adds noise to the output.
    bert_vec = generic_bert_model.encode([query], show_progress_bar=False)
    return bert_vec[0].tolist()

# Phase II - Indexing, Searching and Summarization
<hr> 
<b>Query based recurring tasks</b> <br>
Once Phase I completes, we index the vectors and documents in Elasticsearch and perform our user searches on the given index. There are 5 stages in this phase - <br>
1. Data/Embedding Ingestion into Elasticsearch as dense vectors <br>
2. Generation of embeddings for user search input <br>
3. Matching (user input embeddings and document vectors) using Dense Vector API and score using cosine similarity <br>
4. Summarizing the relevant retrieved documents using BERT summarizer <br>
5. Displaying search results in the Flask search UI <br>

## Indexing and Searching in Elasticsearch

### Helper functions

In [None]:
#Indexing data in ES
def index_data():
    """Recreate the Elasticsearch index and bulk-load every document into it.

    Deletes ``INDEX_NAME`` (a 404 is ignored), creates it again from the
    definition stored in ``INDEX_FILE``, then reads ``DATA_FILE`` one JSON
    document per line and hands the documents to ``index_batch`` in groups of
    ``BATCH_SIZE``. The index is refreshed once everything has been sent.
    """
    print("Creating the index.")
    client.indices.delete(index=INDEX_NAME, ignore=[404])

    with open(INDEX_FILE) as index_file:
        index_definition = index_file.read().strip()
    client.indices.create(index=INDEX_NAME, body=index_definition)

    pending = []
    total = 0

    with open(DATA_FILE) as data_file:
        for total, raw_line in enumerate(data_file, start=1):
            pending.append(json.loads(raw_line.strip()))

            # Keep accumulating until a full batch is available.
            if total % BATCH_SIZE != 0:
                continue

            index_batch(pending, total)
            pending = []
            print("Indexed {} documents.".format(total))

        # Send whatever is left over after the last full batch.
        if pending:
            index_batch(pending, total)
            print("Indexed {} documents.".format(total))

    client.indices.refresh(index=INDEX_NAME)
    print("Done indexing.")

def index_batch(docs,count):
    """Attach the three embedding vectors to each document and bulk-index them.

    :param docs: list of document dicts; each dict is extended in place with
        the bulk-action metadata and the embedding fields
    :param count: number of documents read so far, including this batch
    """
    # `count` already includes this batch, so the batch starts at
    # count - len(docs). Subtracting BATCH_SIZE instead is only correct for
    # full batches and pairs the final, shorter batch with the wrong vectors.
    first_idx = count - len(docs)

    requests = []
    for i, doc in enumerate(docs):
        corpus_idx = first_idx + i
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME
        request["tm_doc_vec"] = tm_doc_embed(corpus_idx)
        request["tfidf_doc_vec"] = tfidf_doc_embed(corpus_idx)
        request["bert_doc_vec"] = bert_doc_embed(corpus_idx)
        requests.append(request)
    bulk(client, requests)

"""Intermediate func (not used independently anymore)
Created to test the code from within notebook and now, the same functionality is implemented in flask code""" 

def handle_query():
    """Prompt for a query, run the ensemble search and print the hits.

    The query is embedded with the topic-model, TF-IDF and BERT embedders and
    every document is scored by the sum of the three cosine similarities
    plus 3.0.

    :return: the response returned by ``client.search``
    """
    query = input("Enter query: ")

    embedding_start = time.time()
    # Evaluated in the listed order: topic model, TF-IDF, BERT.
    query_vectors = {
        "tm_qv": tm_query_embed(query),
        "tfidf_qv": tfidf_query_embed(query),
        "bert_qv": bert_query_embed(query),
    }
    embedding_time = time.time() - embedding_start

    similarity_source = "cosineSimilarity(params.tm_qv, doc['tm_doc_vec']) + cosineSimilarity(params.tfidf_qv, doc['tfidf_doc_vec']) + cosineSimilarity(params.bert_qv, doc['bert_doc_vec']) + 3.0"
    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {"source": similarity_source, "params": query_vectors},
        }
    }

    search_start = time.time()
    search_body = {
        "size": SEARCH_SIZE,
        "query": script_query,
        "_source": {"includes": ["title", "abstract", "url", "authors"]},
    }
    response = client.search(index=INDEX_NAME, body=search_body)
    search_time = time.time() - search_start

    hits = response["hits"]
    print()
    print("{} total hits.".format(hits["total"]["value"]))
    print("embedding time: {:.2f} ms".format(embedding_time * 1000))
    print("search time: {:.2f} ms".format(search_time * 1000))
    for hit in hits["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"])
        print()

    return response

# Wrapper that keeps prompting for queries until interrupted
def run_query_loop():
    """Call ``handle_query`` repeatedly; return quietly on KeyboardInterrupt."""
    try:
        while True:
            handle_query()
    except KeyboardInterrupt:
        return

### Indexing and Querying

In [None]:
# Elasticsearch index name and the JSON file holding its definition
INDEX_NAME = "docs"
INDEX_FILE = "../resources/index.json"

# Newline-delimited JSON documents to ingest, and how many go into one bulk request
DATA_FILE = "../data/cleaned_abstracts.json"
BATCH_SIZE = 5000

# Number of hits requested per search
SEARCH_SIZE = 5

GPU_LIMIT = 0.5  # not referenced anywhere in the visible code

# Client created with the library's default connection settings
client = Elasticsearch()

# Index the whole corpus and report the wall-clock time. index_batch() reads
# tm_output, corpus_lsi and bert_embeddings, so they must already be in memory.
start_time = time.time()
index_data()
indexing_seconds = time.time() - start_time
print('Time taken for indexing:', indexing_seconds, 'seconds')

#run_query_loop()

## Loading Saved Models

In [None]:
#load models and vectors for topic modeling
# The paths match the '../models/' location the artifacts were saved to.
# Only unpickle files produced by this notebook; pickle can execute arbitrary
# code from an untrusted file.
with open('../models/tm_vectors.pkl', 'rb') as f:
    tm_output = pickle.load(f)

with open('../models/tm_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

with open('../models/tm_svd_model.pkl', 'rb') as f:
    svd_model = pickle.load(f)

In [None]:
#load models and vectors for TF-IDF
# The paths match the '../models/' location the artifacts were saved to.
# Only unpickle files produced by this notebook; pickle can execute arbitrary
# code from an untrusted file.
with open('../models/tfidf_dict.pkl', 'rb') as f:
    dictionary = pickle.load(f)

with open('../models/tfidf_lsi.pkl', 'rb') as f:
    lsi = pickle.load(f)

with open('../models/tfidf_vectors.pkl', 'rb') as f:
    corpus_lsi = pickle.load(f)

In [None]:
# Read the BERT corpus embeddings that were pickled into the models folder
with open('../models/bert_corpus_embed.pkl', 'rb') as embed_file:
    bert_embeddings = pickle.load(embed_file)

## Extractive Text Summarization

In [None]:
# Load the configuration, tokenizer and model stored under '../bin/bert'.
# output_hidden_states is switched on in the config before the model is
# created from it, so the order of these statements matters.
custom_config = AutoConfig.from_pretrained('../bin/bert/')
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained('../bin/bert')
custom_model = AutoModel.from_pretrained('../bin/bert', config=custom_config)

# Summarise the concatenated text of the top documents
def get_summary(text):
    """Summarise ``text`` with a Summarizer built on the custom model and tokenizer.

    :param text: text to summarise; converted with ``str()`` first
    :return: the value returned by the summarizer, called with ``min_length=60``
    """
    source_text = str(text)
    summarizer = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
    return summarizer(source_text, min_length=60)

In [None]:
#Call to summarizer
# NOTE(review): `response` must hold a search result (presumably the return
# value of handle_query()); no visible cell assigns it at module level - confirm.
# Every abstract is prefixed with a single space, exactly as before.
output_text = "".join(
    " " + hit['_source']['abstract'] for hit in response['hits']['hits']
)

print("Summary derived from the top 5 abstracts - \n {}".format(get_summary(output_text)))

## Searching via Flask Interface

In [None]:
# Flask application that serves the search UI
app = Flask(__name__)

@app.route('/')
def index():
    """Render the 'index.html' template for the root URL."""
    return render_template('index.html')
#     return render_template('testpage.html')


@app.route('/search')
def analyser():
    """Run the ensemble search for the ``q`` request parameter.

    The query is embedded with the topic-model, TF-IDF and BERT embedders,
    documents are scored by the sum of the three cosine similarities plus 3.0,
    and the abstracts of the returned hits are summarised together.

    :return: JSON with ``top_docs`` (the search response) and ``summary``;
        a JSON error with HTTP status 400 when ``q`` is missing or empty
    """
    query = request.args.get('q')

    if not query:
        # A Flask view must not return None (that surfaces as a server
        # error); report the missing parameter to the client instead.
        return jsonify({"error": "Missing required query parameter 'q'."}), 400

    tm_query_vec = tm_query_embed(query)
    tfidf_query_vec = tfidf_query_embed(query)
    bert_query_vec = bert_query_embed(query)

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.tm_qv, doc['tm_doc_vec']) + cosineSimilarity(params.tfidf_qv, doc['tfidf_doc_vec']) + cosineSimilarity(params.bert_qv, doc['bert_doc_vec']) + 3.0",
                "params": {"tm_qv": tm_query_vec, "tfidf_qv": tfidf_query_vec, "bert_qv": bert_query_vec}
            }
        }
    }

    response = client.search(
        index=INDEX_NAME,
        body={
            "size": SEARCH_SIZE,
            "query": script_query,
            "_source": {"includes": ["title", "abstract"]}
        }
    )

    #Call to summarizer: every abstract is prefixed with a single space
    output_text = "".join(
        " " + hit['_source']['abstract'] for hit in response['hits']['hits']
    )

    summary = get_summary(output_text)

    result_disp = {"top_docs" : response, "summary" : summary}
    return jsonify(result_disp)

app.run(host='0.0.0.0', port=5000)