In [None]:
from pygrok import Grok
import json
import random

def extract_date_from_string(input_string):
    """Pull a month/day/year date out of a free-form string.

    The string is matched against the Grok pattern
    ``%{MONTHNUM:month}/%{MONTHDAY:day}/%{YEAR:year}`` and the captured
    pieces are joined back together with slashes.

    Args:
        input_string: Text expected to contain a date written as M/D/Y.

    Returns:
        The captured date as a ``"month/day/year"`` string.

    Raises:
        ValueError: If the pattern does not match ``input_string``.
    """
    date_pattern = '%{MONTHNUM:month}/%{MONTHDAY:day}/%{YEAR:year}'
    date_dic = Grok(date_pattern).match(input_string)
    # Grok.match returns None when nothing matches; fail with a clear message
    # instead of an opaque "'NoneType' object is not subscriptable".
    if date_dic is None:
        raise ValueError(f"No month/day/year date found in {input_string!r}")
    return "/".join((date_dic['month'], date_dic['day'], date_dic['year']))

def get_organization(participants, organizer_string):
    """Return the first participant organization named in the organizer string.

    Args:
        participants: Iterable of dicts, each optionally carrying an
            ``'organization'`` entry.
        organizer_string: Text describing the organizer.

    Returns:
        The first non-empty organization name that occurs as a substring of
        ``organizer_string``, or ``"Other"`` when none does.
    """
    if not organizer_string:
        return "Other"
    for participant in participants or ():
        organization = participant.get('organization')
        # Skip missing/None/non-string values (they cannot be used with `in`)
        # and empty names (the empty string is a substring of every string,
        # so it would wrongly match any organizer).
        if not isinstance(organization, str) or not organization:
            continue
        if organization in organizer_string:
            return organization
    return "Other"


In [None]:
# Read every numbered input JSON file, flatten each document's topics into
# one record per topic, and persist all records as a single JSON file.
input_dir = "/mnt/sda/highspot/new_data/gong_call_transscript_with_summary_chunked/"
output_dir = "/mnt/sda/highspot/new_data/gong_call_summary_qa_format/"
num_input_files = 25  # inputs are named 0.json ... 24.json

dicts = []
for i in range(num_input_files):
    input_file = input_dir + str(i) + ".json"
    # Context manager so each input file handle is closed right after parsing
    # instead of being left open until garbage collection.
    with open(input_file) as input_fp:
        input_data = json.load(input_fp)
    for doc_id, entry in input_data.items():
        input_doc = entry['document']
        meta = input_doc['meta']
        metadata = meta['metadata']
        # Document-level fields are shared by every topic of this document.
        organization = get_organization(meta['participants'], metadata['organizer'])
        title = metadata['title']
        date = extract_date_from_string(metadata['date'])
        for topic in input_doc['topics']:
            dicts.append({
                'summary': topic['summary'],
                'organization': organization,
                'title': title,
                'date': date,
                'label': topic['label'],
                'id': doc_id,
            })
    print(f"Processed file {i}.json")

output_file_path = output_dir + "all.json"
with open(output_file_path, 'w') as fp:
    json.dump(dicts, fp)

In [None]:
# Read the flattened records back from disk into a DataFrame
import pandas as pd

reviews = pd.read_json(output_dir + "all.json")

# Pull each column out as an array via `.values`
summaries, organizations, titles, dts, labels, ids = (
    reviews[column].values
    for column in ("summary", "organization", "title", "date", "label", "id")
)

# Render every date value as a 'YYYY-MM-DD' string
dates = [pd.to_datetime(str(raw_date)).strftime('%Y-%m-%d') for raw_date in dts]

In [None]:
# One record per summary: the text goes under 'content', everything else under 'meta'.
data = [
    dict(
        content=text,
        meta=dict(
            meeting_id=meeting_id,
            organization=org_name,
            title=call_title,
            date=call_date,
            label=topic_label,
        ),
    )
    for text, org_name, call_title, call_date, topic_label, meeting_id
    in zip(summaries, organizations, titles, dates, labels, ids)
]
print(f"Length of dataset: {len(data)}")

In [None]:
# Spot-check: show one randomly selected record from `data` as the cell output.
random.choice(data)

In [None]:
# Because the dense DPR retrieval method is used later, the preprocessor splits
# the records into chunks of 100 words with an overlap of five words. The
# summaries are not very long, so this won't increase the set size by much.
from haystack.nodes import PreProcessor

split_settings = dict(
    split_by='word',
    split_length=100,
    split_respect_sentence_boundary=False,
    split_overlap=5,
)
processor = PreProcessor(**split_settings)
flattened_docs = processor.process(data)

print(f"Length of dataset: {len(flattened_docs)}")
random.choice(flattened_docs)

In [None]:
# Connect to Elasticsearch, then call delete_documents() on the target index so
# that a rerun does not pile up on previously loaded data.
from haystack.document_stores import ElasticsearchDocumentStore
import getpass
import os

# Never keep the password in the notebook: read it from the ELASTIC_PASSWORD
# environment variable and fall back to an interactive prompt.
# NOTE(review): the password that used to be hard-coded here has been exposed
# in source control and should be rotated.
elastic_password = os.environ.get("ELASTIC_PASSWORD") or getpass.getpass("Elasticsearch password: ")

document_store = ElasticsearchDocumentStore(host = "localhost", port = 9200, username = "elastic",
    password = elastic_password, index = "document", scheme = "https", verify_certs = True,
    ca_certs = "/home/tanvir/work/qa-experiment/http_ca.crt")
document_store.delete_documents()
print("Initialized the elastic store")

In [None]:
# Index the preprocessed documents and report how many the store now holds
document_store.write_documents(flattened_docs)
loaded_count = document_store.get_document_count()
print(f"Loaded {loaded_count} documents")

In [None]:
# Create the dense passage retriever, then compute embeddings for the stored documents.
from haystack.nodes import DensePassageRetriever

query_encoder = "facebook/dpr-question_encoder-single-nq-base"
passage_encoder = "facebook/dpr-ctx_encoder-single-nq-base"

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model=query_encoder,
    passage_embedding_model=passage_encoder,
    max_seq_len_query=64,
    max_seq_len_passage=128,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)
document_store.update_embeddings(retriever)
print("Done updating embeds")

In [None]:
# Build the extractive reader
from haystack.nodes import FARMReader
import os

# Suppress the warning by setting TOKENIZERS_PARALLELISM to "false"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

reader_model = "deepset/roberta-base-squad2"
reader = FARMReader(
    model_name_or_path=reader_model,
    use_gpu=True,
    return_no_answer=False,
)

In [None]:
# Combine the reader and the retriever into an extractive QA pipeline
from haystack.pipelines import ExtractiveQAPipeline

pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [None]:
# Ask an extractive QA question restricted to a single organization and time the call.
from haystack.utils import print_answers
import time

# Named org_filter rather than `filter` so the builtin filter() is not shadowed.
org_filter = {'organization': ['American Technologies, Inc.']}
q = "How is the admin experience?"
# perf_counter() measures elapsed wall-clock time. process_time() only counts
# this process's CPU time, so it would leave out time spent waiting on
# Elasticsearch and under-report the query latency.
start_time = time.perf_counter()
answers = pipeline.run(
    q,
    params = {"Retriever": {"top_k": 30}, "Reader": {"top_k": 10}, "filters": org_filter},
)
elapsed_time = time.perf_counter() - start_time
print(f"Time taken with filter: {elapsed_time} seconds")
print_answers(answers, details = "minimal")

In [None]:
# Let's try the document search pipeline: retrieval only, no reader.
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import print_documents
import time

search_pipeline = DocumentSearchPipeline(retriever)
search_query = "salesforce integration"
# Named search_filter rather than `filter` so the builtin filter() is not shadowed.
# Alternative filter on two metadata fields at once:
# search_filter = {'organization': ['Publicis Sapient'], 'label': ['Decision Timeline']}
search_filter = {'organization': ['American Technologies, Inc.']}

# perf_counter() measures elapsed wall-clock time. process_time() only counts
# this process's CPU time, so it would leave out time spent waiting on
# Elasticsearch and under-report the search latency.
start_time = time.perf_counter()
search_result = search_pipeline.run(
    search_query,
    params = {"Retriever": {"top_k": 10}, "filters": search_filter},
)
elapsed_time = time.perf_counter() - start_time
print(f"Time taken with multi: {elapsed_time} seconds")
print_documents(search_result, max_text_len = 512, print_name = True, print_meta = True)