In [None]:
!pip install python-dotenv 
!pip install gcsfs
!pip install farm-haystack


In [2]:
import json
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack import Document
from datetime import datetime
import config
from haystack.nodes import FARMReader, EmbeddingRetriever

import os
import time

from dotenv import load_dotenv
import gcsfs
import certifi
import uuid
import logging


# Load variables from a local .env file (if any) before reading the settings.
load_dotenv()

# Elasticsearch connection settings; a variable that is not set yields None.
ELASTIC_HOST = os.environ.get('ELASTIC_HOST')
ELASTIC_PORT = os.environ.get('ELASTIC_PORT')
ELASTIC_HTTP_AUTH = os.environ.get('ELASTIC_HTTP_AUTH')
ELASTIC_PASSWORD = os.environ.get('ELASTIC_PASSWORD')
CLOUD_ID = os.environ.get('CLOUD_ID')

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def get_text_documentai(doc_element: dict, document: dict):
    """
    Return the text that a Document AI layout element refers to.

    Document AI identifies form fields by their offsets in the document
    text rather than by storing the text on the element. This function
    converts those offsets to text snippets and joins them.

    Args:
        doc_element: Layout dict whose ``textAnchor.textSegments`` list
            holds ``startIndex`` / ``endIndex`` offsets (ints or numeric
            strings) into ``document['text']``.
        document: Parsed Document AI JSON; only its ``text`` entry is read.

    Returns:
        The concatenated text of all segments, or ``""`` when the element
        has no ``textAnchor`` or no ``textSegments``.

    Raises:
        KeyError: If a segment has no ``endIndex``, or ``document`` has no
            ``text`` entry while there are segments to resolve.
    """
    # An element that covers no text may carry an empty or missing anchor;
    # treat that as "no segments" instead of raising KeyError.
    segments = (doc_element.get('textAnchor') or {}).get('textSegments') or []

    pieces = []
    # If a text segment spans several lines, it will be stored in
    # different text segments, so every segment is sliced separately.
    for segment in segments:
        # A segment without startIndex starts at the beginning of the text.
        start_index = int(segment.get('startIndex', 0))
        end_index = int(segment['endIndex'])
        pieces.append(document['text'][start_index:end_index])
    return "".join(pieces)

In [10]:
def parse_json(uri):
    """
    Read a Document AI JSON file and build one Haystack Document per block.

    Args:
        uri: Path of the JSON file on the local filesystem. It is also
            stored on every Document as ``meta['json_uri']``.

    Returns:
        list: One ``Document`` per entry of every page's ``blocks`` list,
        in file order. The list is empty when the JSON has no ``pages``
        entry.

    Notes:
        A page that raises while being processed is logged with its
        traceback and the remainder of that page is skipped; blocks
        appended before the failure are kept.
    """
    # fs = gcsfs.GCSFileSystem(project=config.PROJECT_ID)
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(uri, encoding='utf-8') as f:
        result = json.load(f)

    # Created up front so that a file without 'pages' yields [] rather than
    # an UnboundLocalError at the return statement.
    block_list = []
    if 'pages' not in result:
        return block_list

    for page in result['pages']:
        try:
            for block in page['blocks']:
                block_text = get_text_documentai(block['layout'], result)
                block_list.append(Document.from_dict({'content': str(block_text),
                                                      'meta': {'json_uri': uri}
                                                      }))
        except Exception as e:
            # Best effort: one malformed page must not abort the whole file.
            logging.exception(e)
            continue

    return block_list

In [4]:
# Path to the CA bundle shipped with certifi; passed as ca_certs when the
# ElasticsearchDocumentStore is created.
certifi_location = certifi.where()

In [7]:
# Local Document AI JSON files to index: parts 1 through 7 of the same document.
uri_list = [f"data/sustainability_doc_part{part}.json" for part in range(1, 8)]

In [5]:
# Connect to the Elasticsearch cluster over HTTPS and use it as the document store.
document_store = ElasticsearchDocumentStore(
    host=ELASTIC_HOST,
    port=ELASTIC_PORT,
    scheme='https',
    ca_certs=certifi_location,
    username="elastic",
    password=ELASTIC_PASSWORD,
    index="april15test2",
)


In [11]:
# Parse each Document AI JSON file and write its blocks to the document store.
# Note: despite its name, result_dict is the list of Documents returned by
# parse_json; result_list collects those lists, one per input file.
result_list = []
for uri in uri_list:
    result_dict = parse_json(uri)
    result_list.append(result_dict)
    document_store.write_documents(result_dict)


In [12]:
#Create retriever
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.pipelines import DocumentSearchPipeline

# Embedding-based retriever bound to the document store, using a
# sentence-transformers model to produce the embeddings.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers",
)

  return self.fget.__get__(instance, owner)()


In [13]:
# Create embeddings for the documents in the store using the retriever's model.
# This has to run before the embedding retriever is queried.
document_store.update_embeddings(retriever)

Batches: 100%|██████████| 30/30 [00:14<00:00,  2.02it/s]Docs/s]
Updating embeddings: 10000 Docs [00:24, 411.54 Docs/s]         


In [24]:
# p_retrieval = DocumentSearchPipeline(retriever)
# res = p_retrieval.run(query="What the company's approach to climate change?", params={"Retriever": {"top_k": 5000}})

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]


In [15]:
# Extractive QA reader; use_gpu=False keeps inference on the CPU.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=False,
)

In [19]:
# Build the retriever -> reader QA pipeline and ask a single question.
extractive_pipe = ExtractiveQAPipeline(reader, retriever)
extractive_pred = extractive_pipe.run(
    #query="Does the company have a whistleblowing policy?",
    query="What the company's approach to climate change?",
    # NOTE(review): "filters" is an empty string here, not a dict - confirm
    # this is intended.
    params={
        "Retriever": {"top_k": 100},
        "Reader": {"top_k": 10},
        "filters": "",
    },
)

# Pair every answer with the full source document it was extracted from,
# looked up by the answer's first document id.
responses = []
for answer in extractive_pred['answers']:
    document_id = answer.document_ids[0]
    matching_docs = [doc for doc in extractive_pred['documents'] if doc.id == document_id]
    document = matching_docs[0].to_dict()
    responses.append({'answer': answer, 'document': document})

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Inferencing Samples: 100%|██████████| 3/3 [00:08<00:00,  2.94s/ Batches]


In [20]:
responses

[{'answer': <Answer {'answer': 'methodically', 'type': 'extractive', 'score': 0.8745584487915039, 'context': 'rs. SAF-HOLLAND therefore\napproaches the topic of climate protection methodically. The point of\ndeparture was to draw up a life cycle assessment that ', 'offsets_in_document': [{'start': 238, 'end': 250}], 'offsets_in_context': [{'start': 69, 'end': 81}], 'document_ids': ['b94ce6a98f7d2625828d47c62b1c685e'], 'meta': {'json_uri': 'data/sustainability_doc_part3.json'}}>,
  'document': {'content': 'FROM LIFE CYCLE ASSESSMENT TO THE STRATEGY PROCESS\nGRI 305/103\nThe rapidly growing environmental and social challenges necessitate a\nclear perspective for the coming years. SAF-HOLLAND therefore\napproaches the topic of climate protection methodically. The point of\ndeparture was to draw up a life cycle assessment that considered all the\nenvironmental and climate impacts during the life cycle of a trio axle unit,\nwhich harmonises the vibrations of three axles.\n',
   'content_typ

In [None]:
responses

In [None]:
#!pip install farm-haystack --upgrade

In [None]:
#more complex pipeline

In [None]:
from haystack.pipelines import Pipeline
from haystack.nodes import  AnswerParser, PromptNode, PromptTemplate
from haystack.schema import Document

# Let's create a custom LFQA (long-form question answering) prompt using PromptTemplate.
# The template text refers to `documents` and `query`; both are supplied by the
# pipe.run(...) call further down in this cell. AnswerParser is attached as the
# template's output parser.
# NOTE(review): the continuation lines inside the triple-quoted string keep their
# leading indentation, so that whitespace is part of the prompt text.
lfqa_prompt = PromptTemplate(name="lfqa",
                             prompt_text="""Synthesize a comprehensive answer from the following topk most relevant paragraphs and the given question. 
                             Provide a clear and concise response that summarizes the key points and information presented in the paragraphs. 
                             Your answer should be in your own words and be no longer than 50 words. 
                             \n\n Paragraphs: {join(documents)} \n\n Question: {query} \n\n Answer:""",
                             output_parser=AnswerParser(),) 

# These docs could also come from a retriever
# Here we explicitly specify them to avoid the setup steps for Retriever and DocumentStore
doc_1 = "Contrails are a manmade type of cirrus cloud formed when water vapor from the exhaust of a jet engine condenses on particles, which come from either the surrounding air or the exhaust itself, and freezes, leaving behind a visible trail. The exhaust can also trigger the formation of cirrus by providing ice nuclei when there is an insufficient naturally-occurring supply in the atmosphere. One of the environmental impacts of aviation is that persistent contrails can form into large mats of cirrus, and increased air traffic has been implicated as one possible cause of the increasing frequency and amount of cirrus in Earth's atmosphere."
doc_2 = "Because the aviation industry is especially sensitive to the weather, accurate weather forecasting is essential. Fog or exceptionally low ceilings can prevent many aircraft from landing and taking off. Turbulence and icing are also significant in-flight hazards. Thunderstorms are a problem for all aircraft because of severe turbulence due to their updrafts and outflow boundaries, icing due to the heavy precipitation, as well as large hail, strong winds, and lightning, all of which can cause severe damage to an aircraft in flight. Volcanic ash is also a significant problem for aviation, as aircraft can lose engine power within ash clouds. On a day-to-day basis airliners are routed to take advantage of the jet stream tailwind to improve fuel efficiency. Aircrews are briefed prior to takeoff on the conditions to expect en route and at their destination. Additionally, airports often change which runway is being used to take advantage of a headwind. This reduces the distance required for takeoff, and eliminates potential crosswinds."


# Let's initiate the PromptNode.
# The API key is read from the environment (load_dotenv() at the top of the
# notebook also picks it up from a local .env file) so that no secret or
# placeholder literal lives in the notebook itself.
node = PromptNode("gpt-3.5-turbo", default_prompt_template=lfqa_prompt, api_key=os.getenv("OPENAI_API_KEY"))

# Single-node pipeline: the query goes straight to the prompt node.
pipe = Pipeline()
pipe.add_node(component=node, name="prompt_node", inputs=["Query"])

output = pipe.run(query="Why do airplanes leave contrails in the sky?", documents=[Document(doc_1), Document(doc_2)])

# Sample answer, for reference only. It is kept as a comment so that the cell
# displays the live model output below instead of this fixed text:
# ["Contrails are manmade clouds formed when water vapor from the exhaust of a
#   jet engine condenses on particles, which come from either the surrounding
#   air or the exhaust itself, and freezes, creating a visible trail. Increased
#   air traffic has been linked to the greater frequency and amount of these
#   cirrus clouds in Earth's atmosphere."]

# Last expression of the cell, so the notebook renders the generated answers.
[a.answer for a in output["answers"]]