In [3]:
# Default notebook setup
import pandas as pd
# Render floats in pandas output with thousands separators (e.g. 1234.5 -> 1,234.5).
pd.options.display.float_format = '{:,}'.format
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"
def reset(df):
    """
    Return a copy of ``df`` with a fresh 0..n-1 index and the same columns.

    The old index is discarded rather than inserted as a column. Dropping it
    directly (instead of inserting it and then selecting the original columns
    again) also works when the index name collides with an existing column,
    a case where ``df.reset_index()`` raises ``ValueError``.

    :param df: pandas DataFrame whose index should be renumbered.
    :return: new DataFrame with the original columns and a default RangeIndex.
    """
    return df.reset_index(drop=True)
def print_counts(df):
    """
    Print, for every column of ``df``, the column label followed by its
    ``value_counts()`` and a separator line.

    :param df: pandas DataFrame to summarise.
    :return: None; the summary is written to standard output.
    """
    divider = '______________________________________'
    for column_label in df.columns:
        print(column_label)
        frequencies = df[column_label].value_counts()
        print(frequencies)
        print(divider)
# ~
#Default Ending

import os
from tqdm import tqdm

print('OK !')

OK !


In [4]:
from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers
from haystack.nodes import DensePassageRetriever, EmbeddingRetriever, FARMReader, EntityExtractor
from haystack.document_stores import ElasticsearchDocumentStore

import torch

print('OK !')

OK !


# Embedding Retrieval

In [5]:
# For EmbeddingRetriever
# Connection settings shared by both document stores
similarity_type = "cosine"
container_name = "localhost"

_shared_store_settings = {
    "host": container_name,
    "port": "9200",
    "similarity": similarity_type,
    "embedding_dim": 768,
}

# One Elasticsearch-backed store per index; only the index name differs.
er_co = ElasticsearchDocumentStore(index="production_er_bigram_collocation", **_shared_store_settings)

er_qg = ElasticsearchDocumentStore(index="production_er_question_generation", **_shared_store_settings)

In [6]:
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor

# PDF converter configured to keep numeric tables (remove_numeric_tables=False)
# and to expect English text (valid_languages=["en"]).
# NOTE(review): `converter` is not referenced by any other visible cell — confirm it is still needed.
converter = PDFToTextConverter(remove_numeric_tables=False, valid_languages = ["en"])

print('OK !')

OK !


pdftotext version 4.03 [www.xpdfreader.com]
Copyright 1996-2021 Glyph & Cog, LLC


# Set document_store 

In [7]:
# Select the collocation store (`er_co`) as the active document store;
# `er_retriever` is built on whatever `document_store` points to.
document_store = er_co

In [8]:
import spacy
from itertools import combinations

# Set globals
nlp = spacy.load("en_core_web_md")

def pre_process(titles):
    """
    Strip stopwords from each title and lemmatize what remains.

    :param titles: list of strings, contains target titles.
    :return: list of strings, one per input title, each made of the
        space-joined lemmas of that title's non-stopword tokens.
    """
    # Parse every title up front, then reduce each parsed title to its lemmas.
    parsed_titles = [nlp(title) for title in titles]

    preprocessed_title_docs = []
    for parsed_title in parsed_titles:
        kept_lemmas = [token.lemma_ for token in parsed_title if not token.is_stop]
        preprocessed_title_docs.append(" ".join(kept_lemmas))

    return preprocessed_title_docs

def similarity_filter(titles, threshold=0.8):
    """
    Drop near-duplicate titles until no remaining pair is too similar.

    Every title is pre-processed with ``pre_process`` and parsed with spaCy
    exactly once. The filter then works in rounds: all pairs of surviving
    titles are compared, and for each pair whose similarity exceeds
    ``threshold`` the earlier title of the pair is marked for removal (when
    several survivors share the same pre-processed text, the first of them is
    the one marked). Marked titles are dropped and the comparison is repeated
    until a round marks nothing.

    The rounds are run in a loop rather than by recursion, so the number of
    removed titles is not limited by Python's recursion depth.

    :param titles: list of strings, contains titles. The list is modified in
        place, as callers may rely on that.
    :param threshold: float, pairs with a similarity strictly greater than this
        value are treated as duplicates. Defaults to 0.8.
    :return: the same ``titles`` list object, now holding only the titles that
        passed the filter (in their original order).
    """
    # Parse once; the pre-processed text of a title never changes between rounds.
    texts = pre_process(titles)
    docs = [nlp(text) for text in texts]

    # Positions (into the original list) of the titles that are still kept.
    survivors = list(range(len(titles)))

    while True:
        # First surviving position of each distinct pre-processed text.
        first_position = {}
        for position in survivors:
            first_position.setdefault(texts[position], position)

        # combinations() yields (earlier, later) position pairs.
        marked = {
            first_position[texts[earlier]]
            for earlier, later in combinations(survivors, 2)
            if docs[earlier].similarity(docs[later]) > threshold
        }

        # Stop as soon as a full round finds no similar pair.
        if not marked:
            break

        survivors = [position for position in survivors if position not in marked]

    # Write the result back into the caller's list to keep the in-place contract.
    titles[:] = [titles[position] for position in survivors]
    return titles

if __name__ == "__main__":
    # Quick manual check of similarity_filter on two near-identical titles;
    # the output recorded for this cell is ['the title'].
    your_title_list = ['a title', 'the title']
    similarity_filter(your_title_list)



['the title']

In [9]:
# Embedding retriever over the store selected in `document_store`, using the
# "all-mpnet-base-v1" model in sentence_transformers format.
# use_gpu=True is only a request: the log recorded for this cell reports
# "Using devices: CPU" and "Number of GPUs: 0".
er_retriever = EmbeddingRetriever(
   document_store=document_store,
   embedding_model="all-mpnet-base-v1",
   model_format="sentence_transformers",
   use_gpu = True,
)

INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0
INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model all-mpnet-base-v1


In [10]:
# Embedding retriever over the question-generation store (`er_qg`), configured
# with the same model and format as `er_retriever`.
# use_gpu=True is only a request: the log recorded for this cell reports
# "Using devices: CPU" and "Number of GPUs: 0".
qg_er_retriever = EmbeddingRetriever(
   document_store=er_qg,
   embedding_model="all-mpnet-base-v1",
   model_format="sentence_transformers",
   use_gpu = True,
)

INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0
INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model all-mpnet-base-v1


# Set API Server

In [11]:
from flask import Flask, request, jsonify
from haystack.pipelines import DocumentSearchPipeline

print('OK !')

OK !


In [12]:
app = Flask(__name__)


def _suggest_from_retriever(retriever, top_k=24):
    """
    Shared body of the suggestion endpoints.

    Reads the JSON request body, runs a document search with ``retriever``,
    removes near-duplicate results with ``similarity_filter`` and builds the
    response.

    Expected request body: a JSON object with a ``query`` field and an
    optional ``filters`` field that is forwarded to the pipeline.

    :param retriever: retriever node to wrap in a DocumentSearchPipeline.
    :param top_k: number of documents requested from the retriever.
    :return: ``(payload, status)`` tuple; ``{'suggested_keywords': [...]}`` with
        201 on success, or ``{'error': ...}`` with 400 when the body is not a
        JSON object containing ``query``.
    """
    # silent=True yields None for a missing or malformed JSON body, so the
    # problem can be reported as a 400 instead of surfacing as a server error.
    body = request.get_json(silent=True)
    if not isinstance(body, dict) or 'query' not in body:
        return {'error': "Request body must be a JSON object with a 'query' field."}, 400

    params = {"Retriever": {"top_k": top_k}}
    filters = body.get('filters')
    if filters is not None:
        # Passed next to the node entry, exactly as the request supplied it.
        params["filters"] = filters

    retriever_pipe = DocumentSearchPipeline(retriever)
    prediction = retriever_pipe.run(query=body['query'], params=params)

    contents = [document.content for document in prediction['documents']]
    # NOTE(review): 201 is kept so existing clients see the same status code,
    # although nothing is created by this request.
    return {'suggested_keywords': similarity_filter(contents)}, 201


@app.route('/haystack_collocation', methods=['POST'])
def haystack_collocation():
    """Suggest de-duplicated document contents retrieved through ``er_retriever``."""
    return _suggest_from_retriever(er_retriever)


@app.route('/haystack_question_generation', methods=['POST'])
def haystack_question_generation():
    """Suggest de-duplicated document contents retrieved through ``qg_er_retriever``."""
    return _suggest_from_retriever(qg_er_retriever)

In [18]:
# Start Flask's built-in server on port 7101; the request log is recorded under this cell.
# The recorded startup banner recommends a production WSGI server for deployment.
app.run(port = 7101)

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


INFO - werkzeug -   * Running on http://127.0.0.1:7101/ (Press CTRL+C to quit)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO - werkzeug -  127.0.0.1 - - [30/Apr/2022 12:07:55] "[35m[1mPOST /haystack_question_generation HTTP/1.1[0m" 201 -


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO - werkzeug -  127.0.0.1 - - [30/Apr/2022 12:09:15] "[35m[1mPOST /haystack_collocation HTTP/1.1[0m" 201 -


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO - werkzeug -  127.0.0.1 - - [30/Apr/2022 12:11:46] "[35m[1mPOST /haystack_question_generation HTTP/1.1[0m" 201 -


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO - werkzeug -  127.0.0.1 - - [30/Apr/2022 12:12:58] "[35m[1mPOST /haystack_collocation HTTP/1.1[0m" 201 -
