In [None]:
# Make sure you have a GPU running
!nvidia-smi

Sun Jan 16 11:13:22 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:

# Install the latest release of Haystack in your own environment 
#! pip install farm-haystack

# Install the latest master of Haystack
!pip install grpcio-tools==1.34.1
!pip install git+https://github.com/deepset-ai/haystack.git

# If you run this notebook on Google Colab, you might need to
# restart the runtime after installing haystack.

Collecting grpcio-tools==1.34.1
  Downloading grpcio_tools-1.34.1-cp37-cp37m-manylinux2014_x86_64.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 5.2 MB/s 
Installing collected packages: grpcio-tools
Successfully installed grpcio-tools-1.34.1
Collecting git+https://github.com/deepset-ai/haystack.git
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-req-build-eytpmsno
  Running command git clone -q https://github.com/deepset-ai/haystack.git /tmp/pip-req-build-eytpmsno
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.1 MB/s 
[?25hCollecting mlflow<=1.13.1
  Downloading mlflow-1.13.1-py3-none-any.whl (14.1 MB)
[K     |████████████████████████████████| 14.1 MB 40 kB/s 
[?25hCollecting transformers==4.13.0
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 48.2 MB/s 
Collecting fastapi
  Downloading fastapi-0.71.0-py3-none-any.whl (51 kB

In [None]:
from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader

## Document Store


In [None]:
# Recommended: Start Elasticsearch using Docker via the Haystack utility function
from haystack.utils import launch_es

launch_es()



In [None]:
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
# wait until ES has started
! sleep 30

In [None]:
# Connect to Elasticsearch

from haystack.document_stores import ElasticsearchDocumentStore


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Creation of document stores

In [None]:
reg_document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
legal_document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
misc_document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

import pickle

with open("/content/drive/MyDrive/CollabData/RegulationElasticDict.pkl","rb") as f:
  dicts = pickle.load(f)

reg_document_store.write_documents(dicts)

with open("/content/drive/MyDrive/CollabData/CaseElasticDict.pkl","rb") as f:
  dicts = pickle.load(f)

legal_document_store.write_documents(dicts)

with open("/content/drive/MyDrive/CollabData/MiscElasticDict.pkl","rb") as f:
  dicts = pickle.load(f)

misc_document_store.write_documents(dicts)


In [None]:
# Document Stores with 512 size chunks

reg_document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
legal_document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
misc_document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

import pickle

with open("/content/drive/MyDrive/CollabData/Regulation512ElasticDict.pkl","rb") as f:
  dicts = pickle.load(f)

reg_document_store.write_documents(dicts)

with open("/content/drive/MyDrive/CollabData/Case512ElasticDict.pkl","rb") as f:
  dicts = pickle.load(f)

legal_document_store.write_documents(dicts)

with open("/content/drive/MyDrive/CollabData/Misc512ElasticDict.pkl","rb") as f:
  dicts = pickle.load(f)

misc_document_store.write_documents(dicts)


## Initalize Retriever, Reader,  & Pipeline

In [None]:
from haystack.nodes import ElasticsearchRetriever

# Alternatives are 
# from haystack.nodes import TfidfRetriever
# from haystack.nodes import EmbeddingRetriever

reg_retriever = ElasticsearchRetriever(document_store=reg_document_store)
legal_retriever = ElasticsearchRetriever(document_store=legal_document_store)
misc_retriever = ElasticsearchRetriever(document_store=misc_document_store)

# reg_retriever = TfidfRetriever(document_store=reg_document_store)
# legal_retriever = TfidfRetriever(document_store=legal_document_store)
# misc_retriever = TfidfRetriever(document_store=misc_document_store)

# reg_retriever = EmbeddingRetriever(document_store=reg_document_store,embedding_model="sentence-transformers/bert-base-nli-mean-tokens")
# legal_retriever = EmbeddingRetriever(document_store=legal_document_store,embedding_model="sentence-transformers/bert-base-nli-mean-tokens")
# misc_retriever = EmbeddingRetriever(document_store=misc_document_store,embedding_model="sentence-transformers/bert-base-nli-mean-tokens")



In [None]:

reg_document_store.update_embeddings(reg_retriever)

legal_document_store.update_embeddings(legal_retriever)

misc_document_store.update_embeddings(misc_retriever)


INFO - haystack.document_stores.elasticsearch -  Updating embeddings for all 19352 docs ...


Updating embeddings:   0%|          | 0/19352 [00:00<?, ? Docs/s]


Inferencing Samples:   0%|          | 0/313 [00:00<?, ? Batches/s][A
Inferencing Samples:   0%|          | 1/313 [00:02<14:03,  2.70s/ Batches][A
Inferencing Samples:   1%|          | 2/313 [00:05<13:52,  2.68s/ Batches][A
Inferencing Samples:   1%|          | 3/313 [00:08<13:46,  2.67s/ Batches][A
Inferencing Samples:   1%|▏         | 4/313 [00:10<13:44,  2.67s/ Batches][A
Inferencing Samples:   2%|▏         | 5/313 [00:13<13:41,  2.67s/ Batches][A
Inferencing Samples:   2%|▏         | 6/313 [00:16<13:38,  2.67s/ Batches][A
Inferencing Samples:   2%|▏         | 7/313 [00:18<13:35,  2.66s/ Batches][A
Inferencing Samples:   3%|▎         | 8/313 [00:21<13:32,  2.66s/ Batches][A
Inferencing Samples:   3%|▎         | 9/313 [00:23<13:28,  2.66s/ Batches][A
Inferencing Samples:   3%|▎         | 10/313 [00:26<13:26,  2.66s/ Batches][A
Inferencing Samples:   4%|▎         | 11/313 [00:29<13:23,  2.66s/ Batches][A
Inferencing Samples:   4%|▍         | 12/313 [00:31<13:21,  2.66s/ Ba

KeyboardInterrupt: ignored

In [None]:
import pickle

# from haystack.utils import launch_milvus
# from haystack.document_stores import MilvusDocumentStore

# launch_milvus()
# reg_document_store = MilvusDocumentStore()

from haystack.document_stores import FAISSDocumentStore

reg_document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
legal_document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
misc_document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

with open("/content/drive/MyDrive/CollabData/RegulationElasticDict.pkl","rb") as f:
  dicts = pickle.load(f)

reg_document_store.write_documents(dicts)

with open("/content/drive/MyDrive/CollabData/CaseElasticDict.pkl","rb") as f:
  dicts = pickle.load(f)

legal_document_store.write_documents(dicts)

with open("/content/drive/MyDrive/CollabData/MiscElasticDict.pkl","rb") as f:
  dicts = pickle.load(f)

misc_document_store.write_documents(dicts)


ContextualVersionConflict: ignored

In [None]:

from haystack.nodes import DensePassageRetriever
reg_retriever = DensePassageRetriever(document_store=reg_document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  max_seq_len_query=128,
                                  max_seq_len_passage=512,
                                  batch_size=16,
                                  use_gpu=True,
                                  embed_title=True,
                                  use_fast_tokenizers=True)
# Important: 
# Now that after we have the DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation. 
# While this can be a time consuming operation (depending on corpus size), it only needs to be done once. 
# At query time, we only need to embed the query and compare it the existing doc embeddings which is very fast.
reg_document_store.update_embeddings(reg_retriever)

legal_retriever = DensePassageRetriever(document_store=legal_document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  max_seq_len_query=128,
                                  max_seq_len_passage=512,
                                  batch_size=16,
                                  use_gpu=True,
                                  embed_title=True,
                                  use_fast_tokenizers=True)

legal_document_store.update_embeddings(legal_retriever)


misc_retriever = DensePassageRetriever(document_store=misc_document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  max_seq_len_query=128,
                                  max_seq_len_passage=512,
                                  batch_size=16,
                                  use_gpu=True,
                                  embed_title=True,
                                  use_fast_tokenizers=True)

misc_document_store.update_embeddings(misc_retriever)



### DataPrep

In [None]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/CollabData/QnA_Latest.csv")
df.index.name="id"
df

Unnamed: 0_level_0,question,Answer Span,context,Document,Keywords,Start index,Annotator
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Does an innocent recipient of UPSI have any defence under the PIT regulations?,insider may prove his innocence by demonstrating the inclusive list of circu...,The report (Para 55) suggests “where a person trades on the basis of content...,1585217059979,"innocent recipient, UPSI, defence",1054.0,
1,Will a promoter group entity require a pre-clearance from a related company ...,"only by ""Designated persons"" if the value of the proposed trades is above su...","With respect to the query at 5(i), attention may be drawn to clause 6 of Sch...",0c9f4131313e54e49d7b9915993ab00e,"pre-clearance, compliance officer, designated persons, unpublished price sen...",678.0,
2,Is inter-se off-market transfer of shares between insiders within a period o...,promoters have the option to convert warrants any time within 18 months from...,"In the instant case, the said promoters have the option to convert warrants ...",0e8e621eb1aee80e102b4186ac874ed5,"inter-se off market transfer, block deal, contra trade",0.0,
3,Can an AIF invest its unutilized funds in liquid mutual funds?,may invest investment income or investment proceeds arising from sale or tra...,The provisions under Regulation 15(1)(f) is provided in the interest of inve...,1e4e946eca55a80467aad2e1546dd639,"unutilized funds, Alternate Investment Funds, Compliance",257.0,
4,What are the penal consequences of not furnishing information asked by SEBI?,attract the penalty prescribed under section 15A of the SEBI Act,The Honorable Securities Appellate Tribunal (hereinafter referred to as “SAT...,1289453383303,"Noticee, Summons, failure to Comply with summons, furnish, information",1396.0,
...,...,...,...,...,...,...,...
103,What is InvIT?,vehicles allowing for adding of projects in future in\nthe same vehicle so t...,InvITs are proposed to be vehicles allowing for adding of projects in future...,1387543144855,"regulations, guidelines, authority, markets",,Raj
104,"What does the Section 77A of the Companies Act, 1956 entail?",contains the basic framework for\ncompanies to buy back its own securities,"Section 77A of the Companies Act, 1956 contains the basic framework for\ncom...",1357124740967,"regulations, guidelines, authority",,Raj
105,"What does the Section 77A(4) of the Companies Act, 1956 say?",every buy back shall\nbe completed within a period of 12 months.,"Section 77A(4) of the Companies Act, 1956 specifies that every buy back shal...",1357124740967,"regulations, guidelines, authority, act",,Raj
106,"What does the Section 77A(2) of the Companies Act, 1956 say?",prohibits only back to back\nbuy backs through board resolution,"Section 77A(2) of the Companies Act,1956 prohibits only back to back\nbuy ba...",1357124740967,"regulations, guidelines, authority, act",,Raj


In [None]:
def start_index(span,context):
  return context.find(span)
      
df["Start index"] = df.apply(lambda x: start_index(x["Answer Span"],x["context"]),axis=1)
df[["Answer Span","context","Start index"]]

Unnamed: 0_level_0,Answer Span,context,Start index
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,insider may prove his innocence by demonstrating the inclusive list of circu...,The report (Para 55) suggests “where a person trades on the basis of content...,844
1,"only by ""Designated persons"" if the value of the proposed trades is above su...","With respect to the query at 5(i), attention may be drawn to clause 6 of Sch...",741
2,promoters have the option to convert warrants any time within 18 months from...,"In the instant case, the said promoters have the option to convert warrants ...",30
3,may invest investment income or investment proceeds arising from sale or tra...,The provisions under Regulation 15(1)(f) is provided in the interest of inve...,278
4,attract the penalty prescribed under section 15A of the SEBI Act,The Honorable Securities Appellate Tribunal (hereinafter referred to as “SAT...,248
...,...,...,...
103,vehicles allowing for adding of projects in future in\nthe same vehicle so t...,InvITs are proposed to be vehicles allowing for adding of projects in future...,26
104,contains the basic framework for\ncompanies to buy back its own securities,"Section 77A of the Companies Act, 1956 contains the basic framework for\ncom...",39
105,every buy back shall\nbe completed within a period of 12 months.,"Section 77A(4) of the Companies Act, 1956 specifies that every buy back shal...",57
106,prohibits only back to back\nbuy backs through board resolution,"Section 77A(2) of the Companies Act,1956 prohibits only back to back\nbuy ba...",41


In [None]:
df[df["Start index"]==-1]

Unnamed: 0_level_0,question,Answer Span,context,Document,Keywords,Start index,Annotator
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
80,What is the capital adequacy requirement for a Merchant banker?,not less than five crore reupees,The capital adequacy requirement referred to in clause (d) of regulation 6 s...,Merchant Bankers,,-1,Vrinda
86,What is corporate governance?,the acceptance by management of the inalienable rights of\nshareholders as t...,Corporate governance is the acceptance by management of the inalienable righ...,1292902977051,"economics, finance, markets",-1,Raj
87,What is Crowdfunding?,solicitation of funds (small amount) from multiple investors through\na web-...,Crowdfunding is solicitation of funds (small amount) from multiple investors...,1403005615257,"economics, finance, markets",-1,Raj
88,What is a green bond?,A green bond is like any other bond where a debt instrument is issued by an ...,A green bond is like any other bond where a debt instrument is issued by an ...,1449143298693,"economics, finance, markets",-1,Raj
95,How many stock exchanges in India are corporatised and demutualised?,18 recognised stock exchanges in India are corporatised and\nDemutualised,"At present, 18 recognised stock exchanges in India are corporatised and\ndem...",1293515802514,"regulations, guidelines, authority, facts",-1,Raj


In [None]:
df = df[df["Start index"]!=-1]
df = df.reset_index()
df

Unnamed: 0,id,question,Answer Span,context,Document,Keywords,Start index,Annotator
0,0,Does an innocent recipient of UPSI have any defence under the PIT regulations?,insider may prove his innocence by demonstrating the inclusive list of circu...,The report (Para 55) suggests “where a person trades on the basis of content...,1585217059979,"innocent recipient, UPSI, defence",844,
1,1,Will a promoter group entity require a pre-clearance from a related company ...,"only by ""Designated persons"" if the value of the proposed trades is above su...","With respect to the query at 5(i), attention may be drawn to clause 6 of Sch...",0c9f4131313e54e49d7b9915993ab00e,"pre-clearance, compliance officer, designated persons, unpublished price sen...",741,
2,2,Is inter-se off-market transfer of shares between insiders within a period o...,promoters have the option to convert warrants any time within 18 months from...,"In the instant case, the said promoters have the option to convert warrants ...",0e8e621eb1aee80e102b4186ac874ed5,"inter-se off market transfer, block deal, contra trade",30,
3,3,Can an AIF invest its unutilized funds in liquid mutual funds?,may invest investment income or investment proceeds arising from sale or tra...,The provisions under Regulation 15(1)(f) is provided in the interest of inve...,1e4e946eca55a80467aad2e1546dd639,"unutilized funds, Alternate Investment Funds, Compliance",278,
4,4,What are the penal consequences of not furnishing information asked by SEBI?,attract the penalty prescribed under section 15A of the SEBI Act,The Honorable Securities Appellate Tribunal (hereinafter referred to as “SAT...,1289453383303,"Noticee, Summons, failure to Comply with summons, furnish, information",248,
...,...,...,...,...,...,...,...,...
98,103,What is InvIT?,vehicles allowing for adding of projects in future in\nthe same vehicle so t...,InvITs are proposed to be vehicles allowing for adding of projects in future...,1387543144855,"regulations, guidelines, authority, markets",26,Raj
99,104,"What does the Section 77A of the Companies Act, 1956 entail?",contains the basic framework for\ncompanies to buy back its own securities,"Section 77A of the Companies Act, 1956 contains the basic framework for\ncom...",1357124740967,"regulations, guidelines, authority",39,Raj
100,105,"What does the Section 77A(4) of the Companies Act, 1956 say?",every buy back shall\nbe completed within a period of 12 months.,"Section 77A(4) of the Companies Act, 1956 specifies that every buy back shal...",1357124740967,"regulations, guidelines, authority, act",57,Raj
101,106,"What does the Section 77A(2) of the Companies Act, 1956 say?",prohibits only back to back\nbuy backs through board resolution,"Section 77A(2) of the Companies Act,1956 prohibits only back to back\nbuy ba...",1357124740967,"regulations, guidelines, authority, act",41,Raj


In [None]:
def preprocess(t,s):
  return {"answer_start":[s],"text":[t]}

df["answers"] = df.apply(lambda x: preprocess(x["Answer Span"],x["Start index"]),axis=1)
df

Unnamed: 0,id,question,Answer Span,context,Document,Keywords,Start index,Annotator,answers
0,0,Does an innocent recipient of UPSI have any defence under the PIT regulations?,insider may prove his innocence by demonstrating the inclusive list of circu...,The report (Para 55) suggests “where a person trades on the basis of content...,1585217059979,"innocent recipient, UPSI, defence",844,,"{'answer_start': [844], 'text': ['insider may prove his innocence by demonst..."
1,1,Will a promoter group entity require a pre-clearance from a related company ...,"only by ""Designated persons"" if the value of the proposed trades is above su...","With respect to the query at 5(i), attention may be drawn to clause 6 of Sch...",0c9f4131313e54e49d7b9915993ab00e,"pre-clearance, compliance officer, designated persons, unpublished price sen...",741,,"{'answer_start': [741], 'text': ['only by ""Designated persons"" if the value ..."
2,2,Is inter-se off-market transfer of shares between insiders within a period o...,promoters have the option to convert warrants any time within 18 months from...,"In the instant case, the said promoters have the option to convert warrants ...",0e8e621eb1aee80e102b4186ac874ed5,"inter-se off market transfer, block deal, contra trade",30,,"{'answer_start': [30], 'text': ['promoters have the option to convert warran..."
3,3,Can an AIF invest its unutilized funds in liquid mutual funds?,may invest investment income or investment proceeds arising from sale or tra...,The provisions under Regulation 15(1)(f) is provided in the interest of inve...,1e4e946eca55a80467aad2e1546dd639,"unutilized funds, Alternate Investment Funds, Compliance",278,,"{'answer_start': [278], 'text': ['may invest investment income or investment..."
4,4,What are the penal consequences of not furnishing information asked by SEBI?,attract the penalty prescribed under section 15A of the SEBI Act,The Honorable Securities Appellate Tribunal (hereinafter referred to as “SAT...,1289453383303,"Noticee, Summons, failure to Comply with summons, furnish, information",248,,"{'answer_start': [248], 'text': ['attract the penalty prescribed under secti..."
...,...,...,...,...,...,...,...,...,...
98,103,What is InvIT?,vehicles allowing for adding of projects in future in\nthe same vehicle so t...,InvITs are proposed to be vehicles allowing for adding of projects in future...,1387543144855,"regulations, guidelines, authority, markets",26,Raj,"{'answer_start': [26], 'text': ['vehicles allowing for adding of projects in..."
99,104,"What does the Section 77A of the Companies Act, 1956 entail?",contains the basic framework for\ncompanies to buy back its own securities,"Section 77A of the Companies Act, 1956 contains the basic framework for\ncom...",1357124740967,"regulations, guidelines, authority",39,Raj,"{'answer_start': [39], 'text': ['contains the basic framework for companies ..."
100,105,"What does the Section 77A(4) of the Companies Act, 1956 say?",every buy back shall\nbe completed within a period of 12 months.,"Section 77A(4) of the Companies Act, 1956 specifies that every buy back shal...",1357124740967,"regulations, guidelines, authority, act",57,Raj,"{'answer_start': [57], 'text': ['every buy back shall be completed within a ..."
101,106,"What does the Section 77A(2) of the Companies Act, 1956 say?",prohibits only back to back\nbuy backs through board resolution,"Section 77A(2) of the Companies Act,1956 prohibits only back to back\nbuy ba...",1357124740967,"regulations, guidelines, authority, act",41,Raj,"{'answer_start': [41], 'text': ['prohibits only back to back buy backs throu..."


In [None]:
df=df.drop(["Start index"],axis=1)
df

Unnamed: 0,id,question,Answer Span,context,Document,Keywords,Annotator,answers
0,0,Does an innocent recipient of UPSI have any defence under the PIT regulations?,insider may prove his innocence by demonstrating the inclusive list of circu...,The report (Para 55) suggests “where a person trades on the basis of content...,1585217059979,"innocent recipient, UPSI, defence",,"{'answer_start': [844], 'text': ['insider may prove his innocence by demonst..."
1,1,Will a promoter group entity require a pre-clearance from a related company ...,"only by ""Designated persons"" if the value of the proposed trades is above su...","With respect to the query at 5(i), attention may be drawn to clause 6 of Sch...",0c9f4131313e54e49d7b9915993ab00e,"pre-clearance, compliance officer, designated persons, unpublished price sen...",,"{'answer_start': [741], 'text': ['only by ""Designated persons"" if the value ..."
2,2,Is inter-se off-market transfer of shares between insiders within a period o...,promoters have the option to convert warrants any time within 18 months from...,"In the instant case, the said promoters have the option to convert warrants ...",0e8e621eb1aee80e102b4186ac874ed5,"inter-se off market transfer, block deal, contra trade",,"{'answer_start': [30], 'text': ['promoters have the option to convert warran..."
3,3,Can an AIF invest its unutilized funds in liquid mutual funds?,may invest investment income or investment proceeds arising from sale or tra...,The provisions under Regulation 15(1)(f) is provided in the interest of inve...,1e4e946eca55a80467aad2e1546dd639,"unutilized funds, Alternate Investment Funds, Compliance",,"{'answer_start': [278], 'text': ['may invest investment income or investment..."
4,4,What are the penal consequences of not furnishing information asked by SEBI?,attract the penalty prescribed under section 15A of the SEBI Act,The Honorable Securities Appellate Tribunal (hereinafter referred to as “SAT...,1289453383303,"Noticee, Summons, failure to Comply with summons, furnish, information",,"{'answer_start': [248], 'text': ['attract the penalty prescribed under secti..."
...,...,...,...,...,...,...,...,...
98,103,What is InvIT?,vehicles allowing for adding of projects in future in\nthe same vehicle so t...,InvITs are proposed to be vehicles allowing for adding of projects in future...,1387543144855,"regulations, guidelines, authority, markets",Raj,"{'answer_start': [26], 'text': ['vehicles allowing for adding of projects in..."
99,104,"What does the Section 77A of the Companies Act, 1956 entail?",contains the basic framework for\ncompanies to buy back its own securities,"Section 77A of the Companies Act, 1956 contains the basic framework for\ncom...",1357124740967,"regulations, guidelines, authority",Raj,"{'answer_start': [39], 'text': ['contains the basic framework for companies ..."
100,105,"What does the Section 77A(4) of the Companies Act, 1956 say?",every buy back shall\nbe completed within a period of 12 months.,"Section 77A(4) of the Companies Act, 1956 specifies that every buy back shal...",1357124740967,"regulations, guidelines, authority, act",Raj,"{'answer_start': [57], 'text': ['every buy back shall be completed within a ..."
101,106,"What does the Section 77A(2) of the Companies Act, 1956 say?",prohibits only back to back\nbuy backs through board resolution,"Section 77A(2) of the Companies Act,1956 prohibits only back to back\nbuy ba...",1357124740967,"regulations, guidelines, authority, act",Raj,"{'answer_start': [41], 'text': ['prohibits only back to back buy backs throu..."


In [None]:
import pickle
docs = ["Issue and Listing of Non Convertible Redeemable Preference Shares", "Investment Advisers", "Depositories and Participants", "Mutual Funds", "Employees Service", "Substantial Acquisition of Shares and Takeovers", "Appointment of Administrator and Procedure for Refunding to the Investors", "Prohibition of Fraudulent and Unfair Trade Practices relating to Securities Market", "Know Your Client Regulations", "Prohibition of Insider Trading", "Merchant Bankers", "Issue and Listing  of Securities Debt Instruments and Security Receipts", "Delisting of Equity Shares","Issue of Capital And Disclosure Requirements2", "Foreign Venture Capital Investor", "Procedure for Board Meetings", "Custodian", "Ombudsman", "Investor Protection and Education Fund", "Foreign Portfolio Investors", "Issue of Sweat Equity", "Collective Investment Scheme", "Portfolio Managers", "Research Analysts", "Procedure for Search and Seizure", "Issue of Capital And Disclosure Requirements", "Share Based Employee Benefits", "Debenture Trustees", "Alternative Investment Funds", "Stock Exchanges and Clearing Corporations", "Self Regulatory Organisations", "Settlement Proceedings", "Issues and Listing of Muncipal Debt Securities", "Buy Back Of Securities2","Issue and Listing of Debt Securities", "Infrastructure Investment Trusts", "Stock Brokers", "Listing Obligations and Disclosure Requirements", "Registrars to an Issue and Share Transfer Agents", "Real Estate Investment Trusts", "Intermediaries", "Certification of Associated Persons in the Securities Markets", "Credit Rating Agencies", "Regulatory Fee on Stock Exchanges", "Underwriters", "Buy Back Of Securities", "Bankers to an Issue", "Central Database of Market Participants"]
#Legal Case files
with open('/content/drive/MyDrive/CollabData/case_filenames.pkl','rb') as f:
    lfile = pickle.load(f) 


In [None]:
from tqdm import tqdm

def fileType(name):
    if name+".txt" in lfile:
        return "legal case"
    if name in docs:
        return "regulations"
    return "misc"

In [None]:
df["doc_type"]=df["Document"].apply(fileType)
len(df[df["doc_type"]=="regulations"]),len(df[df["doc_type"]=="misc"]),len(df[df["doc_type"]=="legal case"])

(39, 30, 34)

testing

In [None]:
DATA = "data/"
regdf = df[df["doc_type"]=="regulations"]
docs = ["Issue and Listing of Non Convertible Redeemable Preference Shares", "Investment Advisers", "Depositories and Participants", "Mutual Funds", "Employees Service", "Substantial Acquisition of Shares and Takeovers", "Appointment of Administrator and Procedure for Refunding to the Investors", "Prohibition of Fraudulent and Unfair Trade Practices relating to Securities Market", "Know Your Client Regulations", "Prohibition of Insider Trading", "Merchant Bankers", "Issue and Listing  of Securities Debt Instruments and Security Receipts", "Delisting of Equity Shares","Issue of Capital And Disclosure Requirements2", "Foreign Venture Capital Investor", "Procedure for Board Meetings", "Custodian", "Ombudsman", "Investor Protection and Education Fund", "Foreign Portfolio Investors", "Issue of Sweat Equity", "Collective Investment Scheme", "Portfolio Managers", "Research Analysts", "Procedure for Search and Seizure", "Issue of Capital And Disclosure Requirements", "Share Based Employee Benefits", "Debenture Trustees", "Alternative Investment Funds", "Stock Exchanges and Clearing Corporations", "Self Regulatory Organisations", "Settlement Proceedings", "Issues and Listing of Muncipal Debt Securities", "Buy Back Of Securities2","Issue and Listing of Debt Securities", "Infrastructure Investment Trusts", "Stock Brokers", "Listing Obligations and Disclosure Requirements", "Registrars to an Issue and Share Transfer Agents", "Real Estate Investment Trusts", "Intermediaries", "Certification of Associated Persons in the Securities Markets", "Credit Rating Agencies", "Regulatory Fee on Stock Exchanges", "Underwriters", "Buy Back Of Securities", "Bankers to an Issue", "Central Database of Market Participants"]
with open(DATA + 'cleanedregulations48.pkl','rb') as f:
    docregs = pickle.load(f)    

regMap = dict()
for key,val in zip(docs,docregs):
  regMap[key]=val

missing=list()
count=0
for i, r in regdf.iterrows():
  flag = False
  for ele in regMap[r["Document"]]:
    
    if " ".join(r["Answer Span"].split()) in " ".join(ele.split()):
      flag=True
      break
  if not flag:
    missing.append(r["id"])
    count+=1
# count  


In [None]:
regdf[regdf["id"].isin(missing)]

Unnamed: 0,id,question,Answer Span,context,Document,Keywords,Annotator,answers,doc_type
11,11,What are the conditions to providing aid to the Investors Association?,(a) that the aid shall not exceed seventy five per cent. of the total expend...,"Conditions for Aid.\nThe aid to investors’ associations, as referred to in c...",Investor Protection and Education Fund,,,"{'answer_start': [239], 'text': ['(a) that the aid shall not exceed seventy ...",regulations
15,15,What are the situations wherein the Investigating Authority under PFUFT regu...,reasonable grounds to believe that such company has been conducting in viola...,"6. Without prejudice to the powers conferred under the Act, the Investigatin...",Prohibition of Fraudulent and Unfair Trade Practices relating to Securities ...,,,"{'answer_start': [568], 'text': ['reasonable grounds to believe that such co...",regulations
30,30,What is the minimum net worth requirement in a sponsor under the infrastruct...,each sponsor has a net worth of not less than Rs. 100 crore if it is a body ...,"4. (1) For the purpose of the grant of certificate to 57[a trust], the Board...",Infrastructure Investment Trusts,,,"{'answer_start': [1013], 'text': ['each sponsor has a net worth of not less ...",regulations
32,32,Who all shall the Selection Committee for Ombudsman consist of?,(i) an expert in the area relating to financial market operations to be nomi...,"3. (1) With effect from such date as the Board may, by an order fix, there s...",Ombudsman,,,"{'answer_start': [447], 'text': ['(i) an expert in the area relating to fina...",regulations
59,59,What is convertible security under SAST regulations,a security which is convertible into or exchangeable with equity shares of ...,“convertible security” means a security which is convertible into or exchang...,Substantial Acquisition of Shares and Takeovers,,Vrinda,"{'answer_start': [28], 'text': [' a security which is convertible into or ex...",regulations
66,66,What is the definition of appreciation under the Share Based Employee Benefits?,difference between the market price of the share of a company on the date of...,. “appreciation” means the difference between the market price of the share ...,Share Based Employee Benefits,,Vrinda,"{'answer_start': [27], 'text': ['difference between the market price of the ...",regulations
67,67,What is Employee Stock Option Scheme or ESOS?,a scheme under which a company grants employee stock option directly or thr...,“employee stock option scheme or ESOS” means a scheme under which a company ...,Share Based Employee Benefits,,Vrinda,"{'answer_start': [44], 'text': [' a scheme under which a company grants empl...",regulations
74,74,What is a bonus issue under the Real Estate Investment Trust regulations?,additional units allotted to the unit holders as on the record date fixed,“bonus issue” means additional units allotted to the unit holders as on the...,Real Estate Investment Trusts,,Vrinda,"{'answer_start': [21], 'text': ['additional units allotted to the unit holde...",regulations
75,75,What is a preferrential issue under the Real Estate Investment Trust Regulat...,issue of units by a listed REIT to any select person or group of persons on...,“preferential issue” means an issue of units by a listed REIT to any select...,Real Estate Investment Trusts,,Vrinda,"{'answer_start': [30], 'text': [' issue of units by a listed REIT to any sel...",regulations
76,76,What is the Right of First Refusal?,right given to the REIT by a person to enter into a transaction with it bef...,"“right-of-first-refusal” or ""ROFR"" of a REIT means the right given to the R...",Real Estate Investment Trusts,,Vrinda,"{'answer_start': [55], 'text': [' right given to the REIT by a person to ent...",regulations


In [None]:
regMap["Listing Obligations and Disclosure Requirements"]

# answer spans present in Definitions section of regulations anol will not be in the pickles
# "Ombudsman" question in id 32 is an example where the regulation, due to the pdf parser, has the text not present as it is in the pdf
# ok the definitions part is a good majority of the missing files causes


In [None]:
set(regdf["Document"].tolist())

In [None]:
miscdf = df[df["doc_type"]=="misc"]


with open('data/misc_filenames_testing.pkl','rb') as filer:
    lcfiles = pickle.load(filer)

with open( 'data/misc_origreg_testing.pkl','rb') as filer:
    origreg = pickle.load(filer)

miscMap = dict()
for key,val in zip(lcfiles,origreg):
  if key not in miscMap.keys():
    miscMap[key]=list()
  miscMap[key].append(val)

missingmisc = list()
count=0
for i, x in miscdf.iterrows():
  flag = False
  for ele in miscMap[x["Document"]]:
    for subreg in ele:
      if x["Answer Span"] in subreg:
        flag=True
        break
  if not flag:
    missingmisc.append(x["id"])
    count+=1
count


30

In [None]:
# miscMap["9f6295e22d780cad4195bcbf8cba3d4b"]
miscdf[miscdf["id"].isin(missingmisc)]
# for x in miscMap["9f6295e22d780cad4195bcbf8cba3d4b"]:
#   print(x)
#   print("\n")

Unnamed: 0,id,question,Answer Span,context,Document,Keywords,Annotator,answers,doc_type
1,1,Will a promoter group entity require a pre-clearance from a related company ...,"only by ""Designated persons"" if the value of the proposed trades is above su...","With respect to the query at 5(i), attention may be drawn to clause 6 of Sch...",0c9f4131313e54e49d7b9915993ab00e,"pre-clearance, compliance officer, designated persons, unpublished price sen...",,"{'answer_start': [741], 'text': ['only by ""Designated persons"" if the value ...",misc
2,2,Is inter-se off-market transfer of shares between insiders within a period o...,promoters have the option to convert warrants any time within 18 months from...,"In the instant case, the said promoters have the option to convert warrants ...",0e8e621eb1aee80e102b4186ac874ed5,"inter-se off market transfer, block deal, contra trade",,"{'answer_start': [30], 'text': ['promoters have the option to convert warran...",misc
3,3,Can an AIF invest its unutilized funds in liquid mutual funds?,may invest investment income or investment proceeds arising from sale or tra...,The provisions under Regulation 15(1)(f) is provided in the interest of inve...,1e4e946eca55a80467aad2e1546dd639,"unutilized funds, Alternate Investment Funds, Compliance",,"{'answer_start': [278], 'text': ['may invest investment income or investment...",misc
7,7,What is the mission of SEBI?,make India as one of the best securities market of the world,The mission of SEBI is to make India as one of the best securities market of...,1292831218021,"SEBI, Mission",,"{'answer_start': [26], 'text': ['make India as one of the best securities ma...",misc
8,8,What is Market Participants and Investors Identification Numbers (MAP IN)?,MAPIN is proposed to be a central electronic integrated database of market p...,The Central Database of Securities Market Participants’ and Investors’ Ident...,1292909401123,"Market Participants??and Investors??Identification Numbers, MAPIN",,"{'answer_start': [297], 'text': ['MAPIN is proposed to be a central electron...",misc
20,20,Can an investment manager manage multiple InvIT?,There is no explicit provision under InvITs Regulations that prohibits an In...,"With reference to your query mentioned at para 3(i) above, it may be noted t...",2b115ec2c59c57dd398c3cbb94c821b4,"InvIT, Investment Manager, multiple InvITs",,"{'answer_start': [1019], 'text': ['There is no explicit provision under InvI...",misc
21,21,How many years should the promoters be shareholders to claim open offer exem...,for not less than three years prior to the proposed acquisition.,"As per Regulation 10(1)(a)(ii) of the Takeover Regulations, one of the condi...",250770860073d543dce2d50101ca7c1a,"three years, inter-se transfer, promoters,",,"{'answer_start': [401], 'text': ['for not less than three years prior to the...",misc
22,22,What is a material subsidiary?,if the income of a subsidiary exceeds 20% of the consolidated income of the ...,2.3 Regulation 16(1)(c) of Listing Regulations defines a material subsidiary...,ee924aa3b9739b5d32a46ae32f8f1fa4,"material subsidiary, 20%, twenty percent",,"{'answer_start': [423], 'text': ['if the income of a subsidiary exceeds 20% ...",misc
23,23,Who is responsible for formulating the code of conduct as per PIT regulations?,the board of directors of every listed company and market intermediary,"d. In this regard, regulation 9 of the PIT Regulations states that the board...",cf54516690143259b41bdd72ef5b1f94,"code of conduct, board, members",,"{'answer_start': [67], 'text': ['the board of directors of every listed comp...",misc
24,24,Can the Trust Deed be modified if it does not affect the interest of the uni...,The schedule does not contemplate a scenario where the modification may not ...,3. Clause 17 of the Third Schedule states the following:\n??17) The Trust De...,46dfd3610a07c100ab6f8a6fe927ad33,"unit holder, third schedule, modification, trust deed",,"{'answer_start': [389], 'text': ['The schedule does not contemplate a scenar...",misc


In [None]:
miscMap["1387543144855"]

['C\x00o\x00n\x00s\x00u\x00l\x00t\x00a\x00t\x00i\x00o\x00n\x00 \x00p\x00a\x00p\x00e\x00r\x00 \x00o\x00n\x00 \x00I\x00n\x00f\x00r\x00a\x00s\x00t\x00r\x00u\x00c\x00t\x00u\x00r\x00e\x00 \x00I\x00n\x00v\x00e\x00s\x00t\x00m\x00e\x00n\x00t\x00 \x00T\x00r\x00u\x00s\x00t\x00s\x00\x00\x001\x00.\x00 \x00I\x00n\x00f\x00r\x00a\x00s\x00t\x00r\x00u\x00c\x00t\x00u\x00r\x00e\x00 \x00i\x00s\x00 \x00t\x00h\x00e\x00 \x00c\x00o\x00r\x00n\x00e\x00r\x00s\x00t\x00o\x00n\x00e\x00 \x00o\x00f\x00 \x00d\x00e\x00v\x00e\x00l\x00o\x00p\x00m\x00e\x00n\x00t\x00 \x00o\x00f\x00 \x00a\x00n\x00y\x00 \x00c\x00o\x00u\x00n\x00t\x00r\x00y\x00.\x00 \x00A\x00c\x00c\x00o\x00r\x00d\x00i\x00n\x00g\x00 \x00t\x00o\x00 \x00t\x00h\x00e\x00 \x001\x002\x00t\x00h\x00 \x00F\x00i\x00v\x00e\x00 \x00Y\x00e\x00a\x00r\x00 \x00P\x00l\x00a\x00n\x00,\x00 \x00I\x00n\x00d\x00i\x00a\x00 \x00r\x00e\x00q\x00u\x00i\x00r\x00e\x00s\x00 \x00a\x00n\x00 \x00i\x00n\x00v\x00e\x00s\x00t\x00m\x00e\x00n\x00t\x00 \x00i\x00n\x00 \x00I\x00n\x00f\x00r\x00a\x00s\x00

In [None]:
legaldf = df[df["doc_type"]=="legal case"]

with open('data/legal_filenames_testing.pkl','rb') as filer:
    lcfiles = pickle.load(filer)

with open( 'data/legal_origreg_testing.pkl','rb') as filer:
    origreg = pickle.load(filer)

legalMap = dict()
for key,val in zip(lcfiles,origreg):
  if key not in legalMap.keys():
    legalMap[key]=list()
  legalMap[key].append(val)


count=0
legalmissing = list()
for i, x in legaldf.iterrows():
  flag = False
  for ele in legalMap[x["Document"]+".pdf"]:
    if flag:
      break

    for subreg in ele:
      if x["Answer Span"] in subreg:
        flag=True
        break


  if not flag:
    legalmissing.append(x["id"])
    count+=1
count


34

In [None]:
legaldf[legaldf["id"].isin(legalmissing)]


Unnamed: 0,id,question,Answer Span,context,Document,Keywords,Annotator,answers,doc_type
0,0,Does an innocent recipient of UPSI have any defence under the PIT regulations?,insider may prove his innocence by demonstrating the inclusive list of circu...,The report (Para 55) suggests “where a person trades on the basis of content...,1585217059979,"innocent recipient, UPSI, defence",,"{'answer_start': [844], 'text': ['insider may prove his innocence by demonst...",legal case
4,4,What are the penal consequences of not furnishing information asked by SEBI?,attract the penalty prescribed under section 15A of the SEBI Act,The Honorable Securities Appellate Tribunal (hereinafter referred to as “SAT...,1289453383303,"Noticee, Summons, failure to Comply with summons, furnish, information",,"{'answer_start': [248], 'text': ['attract the penalty prescribed under secti...",legal case
5,5,Who is an acquirer under SAST?,"any person who, directly or indirectly, acquires or agrees to acquire shares...","In terms of regulation 7(1) of SAST, the obligation of disclosure is cast up...",1290154724736_NEG,"Acquirer, SAST,",,"{'answer_start': [292], 'text': ['any person who, directly or indirectly, ac...",legal case
6,6,How is acquisition under PIT Regulations related to acquirer under SAST Regu...,not been defined either under PIT or SAST,"As regards regulation 13 (1) of PIT, the disclosure has to be made in Form A...",1290154724736_NEG,"acquistion, Acquirer",,"{'answer_start': [287], 'text': ['not been defined either under PIT or SAST']}",legal case
9,9,What are the various provisions of Section 4(2) of the PFUTP?,prohibits a person from indulging in an act which creates false or misleadin...,"Regulation 4(2)(a) of PFUTP, inter alia, prohibits a person from indulging i...",1293186097798,"4(2), PFUTP, false, misleading, manipulate",,"{'answer_start': [41], 'text': ['prohibits a person from indulging in an act...",legal case
25,25,What are factors under section 15J of the SEBI Act that would be taken into ...,"(a) the amount of disproportionate gain or unfair advantage, \nwherever quan...","In this regard, the provisions of Section 15J of the SEBI Act and Rule 5 of ...",1608121204158_1,"noticee, adjudication, sec15j, penalty, quantum",,"{'answer_start': [237], 'text': ['(a) the amount of disproportionate gain or...",legal case
26,26,Does the non-compliance of regulation 13(1) of PIT attract monetary penalty ...,he shall be liable to a penalty of one lakh rupees \nfor each day during whi...,"15A. Penalty for failure to furnish, information , return etc. \n \nIf any p...",1293773712071,"non-compliance, penalty, sec15a, PIT, adjudication",,"{'answer_start': [405], 'text': ['he shall be liable to a penalty of one lak...",legal case
27,27,Does a violation of regulation 13 (4) of PIT attract monetary \npenalty unde...,"liable \nfor penalty under sections 15HA and 15A(b) of SEBI Act, 1992",It has been established that the Noticee has violated provisions of section ...,1300170154092,"penalty, violation, PIT, sec15a, adjudication, regulation",,"{'answer_start': [251], 'text': ['liable for penalty under sections 15HA an...",legal case
28,28,Does any violation under regulation 12(1) attract monetary penalty under sec...,Noticee is liable \nfor monetary penalty under section 15HB and section 15G(...,As the violation of the statutory obligation under regulation 12(1) of PIT \...,1300172109593,"penalty, violation, PIT, regulation, adjudication, sec15",,"{'answer_start': [274], 'text': ['Noticee is liable for monetary penalty un...",legal case
34,34,What is the main factor used to determine whether a transaction has been exe...,intention of the parties,Whether a transaction has been executed with the intention to manipulate the...,1287393831571,"manipulate, market, intention, parties",Divya,"{'answer_start': [129], 'text': ['intention of the parties']}",legal case


In [None]:
for t in legalMap["1312182027229.pdf"]:
  print(t)
  print()

- OrderGennexLaboratories[1].doc Adjudication Order In Respect of Gennex Laboratories Ltd and its Directors July 29, 2011 In The Matter of Gennex Laboratories Ltd BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJUDICATION ORDER No PKB AO- 42 2011] UNDER SECTION 15-I OF SECURITIES AND EXCHANGE BOARD OF INDIA ACT, 1992 READ WITH RULE 5 OF SEBI (PROCEDURE FOR HOLDING INQUIRY AND IMPOSING PENALTIES BY ADJUDICATING OFFICER) RULES, 1995 In Respect Of Gennex Laboratories Ltd And its directors: Vinod Baid U.C. Bhandari Y. Ravinder Reddy Kishore Jhunjhunwala BRIEF FACTS OF THE CASE 1. Investigation into the affairs relating to buying and selling or dealing in the shares of Gennex Laboratories Limited (hereinafter referred to as ‘Noticee -1’ or ‘GLL’ or ‘the company’) was made to ascertain whether any provision of the SEBI Act, 1992 and various rules and regulations made there under have been violated. The main focus of the investigation was to ascertain whether there w

### Evaluating retrievers

In [None]:
!unzip /content/drive/MyDrive/CollabData/pipeline.zip

Archive:  /content/drive/MyDrive/CollabData/pipeline.zip
   creating: data/
  inflating: data/misc_DF_new.json   
  inflating: data/legal_filenames_testing.pkl  
  inflating: data/informal_queries.pkl  
  inflating: data/finaltrain.pkl     
  inflating: data/misc_tf_idf.pkl    
  inflating: data/misc_total_vocab_new.pkl  
  inflating: data/case_origreg.pkl   
 extracting: data/misc_tf_idf.json   
   creating: data/QnA_version1/
  inflating: data/QnA_version1/case_origreg.pkl  
  inflating: data/QnA_version1/misc_origreg.pkl  
  inflating: data/cleanedregtopics48.pkl  
  inflating: data/misc_origreg.pkl   
  inflating: data/trainquery.pkl     
  inflating: data/case_total_vocab_new.pkl  
  inflating: data/case_queries.pkl   
  inflating: data/misc_filenames_testing.pkl  
  inflating: data/misc_origreg_new.pkl  
  inflating: data/cleanedregulations48.pkl  
  inflating: data/misc_tf_idf_new.pkl  
  inflating: data/case_origreg_new.pkl  
  inflating: data/informal_sentences.pkl  
  inflati

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
import spacy
from nltk.metrics import edit_distance
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import os, json
import pickle

DATA = "data/"
nlp = spacy.load('en_core_web_sm')
defdict = {}
path_to_json = DATA + 'Definitions/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
for i in json_files:
    with open(DATA + 'Definitions/'+i) as json_file:
        data = json.load(json_file)
    defdict.update(data)
definitions = list(defdict.keys())
defvalues = list(defdict.values())


STOPWORDS = set(
    stopwords.words('english') +\
    ['mm', 'section', 'subsection', 'schedule', '-PRON-', 'chapter', 'regulation', 'repealed', 'thereto','unpublishe', 'thereunder','guideline', 'reference','onus','make','Page','Securities','Exchange','India'])

with open(DATA + 'glossary.json') as f:
    glossary = json.load(f)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
DATA = "data/"
docs = ["Issue and Listing of Non Convertible Redeemable Preference Shares", "Investment Advisers", "Depositories and Participants", "Mutual Funds", "Employees Service", "Substantial Acquisition of Shares and Takeovers", "Appointment of Administrator and Procedure for Refunding to the Investors", "Prohibition of Fraudulent and Unfair Trade Practices relating to Securities Market", "Know Your Client Regulations", "Prohibition of Insider Trading", "Merchant Bankers", "Issue and Listing  of Securities Debt Instruments and Security Receipts", "Delisting of Equity Shares","Issue of Capital And Disclosure Requirements2", "Foreign Venture Capital Investor", "Procedure for Board Meetings", "Custodian", "Ombudsman", "Investor Protection and Education Fund", "Foreign Portfolio Investors", "Issue of Sweat Equity", "Collective Investment Scheme", "Portfolio Managers", "Research Analysts", "Procedure for Search and Seizure", "Issue of Capital And Disclosure Requirements", "Share Based Employee Benefits", "Debenture Trustees", "Alternative Investment Funds", "Stock Exchanges and Clearing Corporations", "Self Regulatory Organisations", "Settlement Proceedings", "Issues and Listing of Muncipal Debt Securities", "Buy Back Of Securities2","Issue and Listing of Debt Securities", "Infrastructure Investment Trusts", "Stock Brokers", "Listing Obligations and Disclosure Requirements", "Registrars to an Issue and Share Transfer Agents", "Real Estate Investment Trusts", "Intermediaries", "Certification of Associated Persons in the Securities Markets", "Credit Rating Agencies", "Regulatory Fee on Stock Exchanges", "Underwriters", "Buy Back Of Securities", "Bankers to an Issue", "Central Database of Market Participants"]
with open(DATA + 'cleanedregulations48.pkl','rb') as f:
    docregs = pickle.load(f)    

regMap = dict()
for key,val in zip(docs,docregs):
  regMap[key]=val


['Issue and Listing of Non Convertible Redeemable Preference Shares',
 'Investment Advisers',
 'Depositories and Participants',
 'Mutual Funds',
 'Employees Service',
 'Substantial Acquisition of Shares and Takeovers',
 'Appointment of Administrator and Procedure for Refunding to the Investors',
 'Prohibition of Fraudulent and Unfair Trade Practices relating to Securities Market',
 'Know Your Client Regulations',
 'Prohibition of Insider Trading',
 'Merchant Bankers',
 'Issue and Listing  of Securities Debt Instruments and Security Receipts',
 'Delisting of Equity Shares',
 'Issue of Capital And Disclosure Requirements2',
 'Foreign Venture Capital Investor',
 'Procedure for Board Meetings',
 'Custodian',
 'Ombudsman',
 'Investor Protection and Education Fund',
 'Foreign Portfolio Investors',
 'Issue of Sweat Equity',
 'Collective Investment Scheme',
 'Portfolio Managers',
 'Research Analysts',
 'Procedure for Search and Seizure',
 'Issue of Capital And Disclosure Requirements',
 'Share

In [None]:
path_to_json = DATA + 'Definitions/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
for i in json_files:
    with open(DATA + 'Definitions/'+i) as json_file:
        data = json.load(json_file)

    regMap[i.split(".json")[0]] = [ k+v for k,v in data.items() ] + regMap[i.split(".json")[0]]


In [None]:
docregs = list(regMap.values())

with open(DATA + 'cleanedregulations48.pkl','wb') as f:
    pickle.dump(docregs,f)    


In [None]:
!zip -r pipeline.zip data
!mv pipeline.zip /content/drive/MyDrive/CollabData/

  adding: data/ (stored 0%)
  adding: data/misc_DF_new.json (deflated 83%)
  adding: data/legal_filenames_testing.pkl (deflated 81%)
  adding: data/informal_queries.pkl (deflated 74%)
  adding: data/finaltrain.pkl (deflated 73%)
  adding: data/misc_tf_idf.pkl (deflated 85%)
  adding: data/misc_total_vocab_new.pkl (deflated 60%)
  adding: data/case_origreg.pkl (deflated 75%)
  adding: data/misc_tf_idf.json (stored 0%)
  adding: data/QnA_version1/ (stored 0%)
  adding: data/QnA_version1/case_origreg.pkl (deflated 75%)
  adding: data/QnA_version1/misc_origreg.pkl (deflated 75%)
  adding: data/cleanedregtopics48.pkl (deflated 59%)
  adding: data/misc_origreg.pkl (deflated 75%)
  adding: data/trainquery.pkl (deflated 75%)
  adding: data/case_total_vocab_new.pkl (deflated 56%)
  adding: data/case_queries.pkl (deflated 75%)
  adding: data/misc_filenames_testing.pkl (deflated 52%)
  adding: data/misc_origreg_new.pkl (deflated 75%)
  adding: data/cleanedregulations48.pkl (deflated 75%)
  adding

In [None]:
defdict["designated securities"]
what is designated securities

designated securities means specified securities, non-convertible debt securities, non-convertible redeemable preference shares, perpetual debt instrument, perpetual non-cumulative preference shares, Indian depository receipts, securitised debt instruments, [security receipts,]1 units issued by mutual funds and any other securities as may be specified by the Board 


' means specified securities, non-convertible debt securities, non-convertible redeemable preference shares, perpetual debt instrument, perpetual non-cumulative preference shares, Indian depository receipts, securitised debt instruments, [security receipts,]1 units issued by mutual funds and any other securities as may be specified by the Board ; (i)'

In [None]:
def queryvocab(query):
    question_words = ["What","When","Where","Why","How","Who"]
    REMOVE_WORDS = ['regulations','rules','rule','chapter','section','sub','SEBI','means','shall','Securities','Exchange',
                    'pertaining','India']
    qvocab=[i for i in query.split() if i not in stopwords.words() + REMOVE_WORDS
            + question_words]
            
    return qvocab, " ".join(qvocab)

In [None]:
def querypreprocess(query, qvocab, definitions, finaltopics):        
    qnew = " ".join(qvocab)

    importantwords = [ i for i in qvocab if i in definitions]

    expansionwords = []

    for i in qvocab:
      for j in finaltopics:
        if i in j:
          if i not in importantwords:
            importantwords.append(i)
          else:
              if i in definitions:
                  expansionwords.append(i)
    

    for i in expansionwords:
        if i in definitions:
          k=definitions.index(i)
          s = i
          s = s + defvalues[k]
          qnew = qnew + ' ' + s
        else:
          qnew = qnew + ' ' + i
    query = qnew
    
    sent = nlp(query)
    t=0
    for token in sent:
        if(str(token) in importantwords):
            if(t!=0):
                if(str(sent[t-1]) not in importantwords):
                    importantwords.append(str(sent[t-1]))
            if(token.tag_ == 'VB'):
                importantwords.append(str(token))
        t+=1
    
    return query, qvocab, importantwords, expansionwords

In [None]:
with open(DATA + 'cleanedregtopics48.pkl','rb') as f:
    finaltopics = pickle.load(f)    
#vocab definitions
with open(DATA + 'mainvocab.pkl','rb') as f:
    mainvocab = pickle.load(f) 

with open(DATA + 'vocabdef.pkl','rb') as f:
    vocabdef = pickle.load(f) 


In [None]:
from haystack.pipelines import DocumentSearchPipeline

reg_doc_retrieval = DocumentSearchPipeline(reg_retriever)
legal_doc_retrieval = DocumentSearchPipeline(legal_retriever)
misc_doc_retrieval = DocumentSearchPipeline(misc_retriever)


In [None]:
limit=16

In [None]:
from tqdm import tqdm
data=dict()
data["question"]=list()

data["context"]=list()
data["answer"]=list()


for i in range(1,limit):
  data["rank{}".format(i)]=list()

fr,fl,fm=0,0,0
tr,tl,tm=0,0,0
for i,r in tqdm(df.iterrows()):
    if r["doc_type"]=="regulations":
      res = reg_doc_retrieval.run(    
          query=r["question"]
          , params={"Retriever": {"top_k": limit-1}}
      )  
      tr+=1
    elif r["doc_type"]=="legal case":
      res = legal_doc_retrieval.run(    
          query=r["question"]
          , params={"Retriever": {"top_k": limit-1}}
      )  
      tl+=1
    else:    
      res = misc_doc_retrieval.run(    
          query=r["question"]
          , params={"Retriever": {"top_k": limit-1}}
      )  
      tm+=1

    retrieved_docs = [ d.content for d in res["documents"]]
    flag=True
    for i in range(1,limit):
      data["rank{}".format(i)].append(retrieved_docs[i-1])
      if flag and (r["context"] in retrieved_docs[i-1] or retrieved_docs[i-1] in r["context"]):
        if r["doc_type"]=="regulations":
          fr+=1
        elif r["doc_type"]=="legal case":
          fl+=1
        else:
          fm+=1
        flag=False

    data["question"].append(r["question"])
    data["context"].append(r["context"])
    data["answer"].append(r["Answer Span"])


103it [00:03, 31.99it/s]


In [None]:
fr,fl,fm

(3, 12, 0)

In [None]:
tr,tl,tm

(39, 34, 30)

In [None]:
fr*100/tr,fl*100/tl,fm*100/tm

(7.6923076923076925, 35.294117647058826, 0.0)

In [None]:
pd.DataFrame(data).to_csv("nonpreprocessed.csv",index=False)
pd.DataFrame(data)

Unnamed: 0,question,context,answer,rank1,rank2,rank3,rank4,rank5,rank6,rank7,rank8,rank9,rank10,rank11,rank12,rank13,rank14,rank15
0,Does an innocent recipient of UPSI have any defence under the PIT regulations?,The report (Para 55) suggests “where a person trades on the basis of content...,insider may prove his innocence by demonstrating the inclusive list of circu...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,Filed on : 26.3.2014 Registered on : 03.6.2014 Decided on : 15.12.2017 Durat...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...
1,Will a promoter group entity require a pre-clearance from a related company ...,"With respect to the query at 5(i), attention may be drawn to clause 6 of Sch...","only by ""Designated persons"" if the value of the proposed trades is above su...","KIRLOSKAR CHILLERS PRIVATE LIMITED\tEnriching LivesA Ki4,-Loskar ro-wp- Covv...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,ADJUDICATION ORDER NO. Order/JS/DJ/2019-20/3484-3488 UNDER SECTION 15-I OF S...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA (ADJU...,ADJUDICATION ORDER NO. Order/JS/DJ/2019-20/3493-3494 UNDER SECTION 15-I OF S...,WTM/SM/IVD/ID2/9713/2020-21 BEFORE THE SECURITIES AND EXCHANGE BOARD OF INDI...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA (ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,` SECURITIES AND EXCHANGE BOARD OF INDIA WTM/MPB/IVD/ ID1/139/2020 ORDER Und...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...
2,Is inter-se off-market transfer of shares between insiders within a period o...,"In the instant case, the said promoters have the option to convert warrants ...",promoters have the option to convert warrants any time within 18 months from...,"1.NIMI UPENDRABHAI PATELKrishnarpan"" Samir Estate, Gotri-Sevasi Road, Sevasi...",VITT7V redv&rcil:44.1Securities and Exchange Board of IndiaCHIEF GENERAL MAN...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,STAR CEMENT LIMITED(Formerly CEMENT MANUFACTURING COMPANY LTD)ToCorporate Fi...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,Deputy General ManagerIntegrated Surveillance Department71- zr Tr Fd93fiT\tc...,"KIRLOSKAR CHILLERS PRIVATE LIMITED\tEnriching LivesA Ki4,-Loskar ro-wp- Covv...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,` SECURITIES AND EXCHANGE BOARD OF INDIA WTM/MPB/IVD/ ID1/139/2020 ORDER Und...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,"BEFORE THE SECURITIES AND EXCHANGE BOARD OF INDIA CORAM: MADHABI PURI BUCH, ...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...
3,Can an AIF invest its unutilized funds in liquid mutual funds?,The provisions under Regulation 15(1)(f) is provided in the interest of inve...,may invest investment income or investment proceeds arising from sale or tra...,g1'(alzre1 Tfa-34frrcirviHqSecurities and Exchange Board of IndiaDEPUTY GENE...,29 June 2018The Chief General ManagerInvestment Management DepartmentDivisio...,41471Eeffvfffar1-4- rciPiiq O. aSecurities and Exchange Board of IndiaDEPUTY...,Securities and ExchangeBoard of IndiaDEPUTY GENERAL MANAGERINVESTMENT MANAGE...,CONSULTATIVE PAPER ON GUIDELINES ON OVERSEAS INVESTMENTS AND OTHERISSUES/CLA...,AJM FINANCIAL27 November 2018The Chief General ManagerInvestment Management ...,CONCEPT PAPER ON PROPOSED ALTERNATIVE INVESTMENT FUNDS REGULATION FORPUBLIC ...,Consultation Paper on Crowdfunding in India1.0 Introduction1.1 This consulta...,Report submitted by Alternative Investment Policy Advisory Committee1. To so...,"CHIEF GENERAL MANAGER\t9774-irredyftINTEGRATED SURVEILLANCE DEPARTMENT\ta?)""...",.4\\tSBI Funds Management Private Limited A joint venture between SBI & AMUN...,".v.`""71-TATAcov)26 May 2016P/2016052710000042025The Chief General Manager\tA...","LettsVentureNovember 4th, 2019To,Investment Management Department,Division o...","KellyGammaAdvisors LLP1007/1008, le Floor, Shapath V, Opp. Karnavati Club, N...","STTrft 1,61-1fa.f; 1\telSecurities and ExchangeBoard of IndiaDEPUTY GENERAL ..."
4,What are the penal consequences of not furnishing information asked by SEBI?,The Honorable Securities Appellate Tribunal (hereinafter referred to as “SAT...,attract the penalty prescribed under section 15A of the SEBI Act,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,SECURITIES AND EXCHANGE BOARD OF INDIA [ADJUDICATION ORDER NO. MC/AO- 01/201...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA _____...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OFINDIA ADJUDI...,WTM/SM/IVD/ID2/9711/2020-21 BEFORE THE SECURITIES AND EXCHANGE BOARD OF INDI...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,"3. (1) With effect from such date as the Board may, by an order fix, there s...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,3. (1) An application by a debenture trustee for grant of a cert...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,What is InvIT?,InvITs are proposed to be vehicles allowing for adding of projects in future...,vehicles allowing for adding of projects in future in\nthe same vehicle so t...,"Draft SEBI (Infrastructure Investment Trusts) Regulations, 2014SEBI had come...",3. (1) No person shall act as an InvIT unless it has obtained a ...,Consultation paper for amendments to the SEBI (Infrastructure InvestmentTrus...,Consultation paper for guidelines for public issue of units of Infrastructur...,Consultation paper for Disclosure of financial information in offerdocument/...,"LINDIA INFRASTRUCTURE TRUST(Principal place of Business: Unit no.804,A-Wing,...",Consultation PaperContinuous disclosures to be made by Infrastructure Invest...,rcil444t4 albSecurities and Exchange Board of IndiaDEPUTY GENERAL MANAGERDEP...,Consultation paper for amendments to the SEBI (Infrastructure Investment Tru...,IITT*TiTirTiftalfr rcirviti W-13cSecurities and Exchange Board of IndiaYogit...,"Bimal JalanFormer Governor, Reserve Bank of IndiaChairman. Centre for Develo...",Concept paper for issuance of Green Bonds1. BackgroundA green bond is like a...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,National Strategy for Financial Education2National Strategy for Financial Ed...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...
99,"What does the Section 77A of the Companies Act, 1956 entail?","Section 77A of the Companies Act, 1956 contains the basic framework for\ncom...",contains the basic framework for\ncompanies to buy back its own securities,9117rei/ gffry7W3* W'FirerSecurities and ExchangeBoard of IndiaCFD/DCR/TO/DA...,"SPLStwo..'--ketc .77TUPREME PE ROCHEM LIDSolitaire Corporate Park, Buliding ...",SECURITIES AND EXCHANGE BOARD OF INDIA WTM/SM/IVD/ID4/6335/2019-20 ORDER UND...,3. (1) These regulations shall be applicable to buy-back of shares or other ...,"Navigate your nextDate: January 29, 2019Securities and Exchange Board of Ind...",Deputy General Manager\t9lirdhi lairaIntegrated Surveillance Department\tWpm...,"KPIT CumminsInfosysterns LimitedSeptember 7, 2011To,Mr. Sumit AgrawalAssista...",ADJUDICATION ORDER NO. JS/DJ/12-16/2017 UNDER SECTION 15-I OF SECURITIES AND...,"3. (1) No insider shall communicate, provide, or allow access to any unpubli...",44.TrraThEr3T1T rcrldSecurities and ExchangeBoard of IndiaCHIEF GENERAL MANA...,"g10 P-\[S IsALOKALOK INDUSTRIES LIMITEDPeninsula Towers, Peninsula Corporate...",TIDE WATER OIL CO. (INDIA) LTD.Regd. Office: Yule House I 8 Dr. Rajendra Pra...,Public comments on Report of the Takeover Regulations Advisory Committee may...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...
100,"What does the Section 77A(4) of the Companies Act, 1956 say?","Section 77A(4) of the Companies Act, 1956 specifies that every buy back shal...",every buy back shall\nbe completed within a period of 12 months.,9117rei/ gffry7W3* W'FirerSecurities and ExchangeBoard of IndiaCFD/DCR/TO/DA...,"SPLStwo..'--ketc .77TUPREME PE ROCHEM LIDSolitaire Corporate Park, Buliding ...",SECURITIES AND EXCHANGE BOARD OF INDIA WTM/SM/IVD/ID4/6335/2019-20 ORDER UND...,3. (1) These regulations shall be applicable to buy-back of shares or other ...,"Navigate your nextDate: January 29, 2019Securities and Exchange Board of Ind...",Deputy General Manager\t9lirdhi lairaIntegrated Surveillance Department\tWpm...,"KPIT CumminsInfosysterns LimitedSeptember 7, 2011To,Mr. Sumit AgrawalAssista...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ ADJ...,"raiav Crind-eteckz)R SYSTEMS INTERNATIONAL LIMITEDSEI CMMI Level 5, PCMM Lev...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA (ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...
101,"What does the Section 77A(2) of the Companies Act, 1956 say?","Section 77A(2) of the Companies Act,1956 prohibits only back to back\nbuy ba...",prohibits only back to back\nbuy backs through board resolution,9117rei/ gffry7W3* W'FirerSecurities and ExchangeBoard of IndiaCFD/DCR/TO/DA...,"SPLStwo..'--ketc .77TUPREME PE ROCHEM LIDSolitaire Corporate Park, Buliding ...",SECURITIES AND EXCHANGE BOARD OF INDIA WTM/SM/IVD/ID4/6335/2019-20 ORDER UND...,3. (1) These regulations shall be applicable to buy-back of shares or other ...,"Navigate your nextDate: January 29, 2019Securities and Exchange Board of Ind...",Deputy General Manager\t9lirdhi lairaIntegrated Surveillance Department\tWpm...,"KPIT CumminsInfosysterns LimitedSeptember 7, 2011To,Mr. Sumit AgrawalAssista...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ ADJ...,"raiav Crind-eteckz)R SYSTEMS INTERNATIONAL LIMITEDSEI CMMI Level 5, PCMM Lev...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA (ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...


In [None]:
data=dict()
data["question"]=list()

data["context"]=list()
data["answer"]=list()

for i in range(1,limit):
  data["rank{}".format(i)]=list()

for i,r in tqdm(df.iterrows()):
    _,query = queryvocab(r["question"])
    
    data["question"].append(query)

    if r["doc_type"]=="regulations":
      res = reg_doc_retrieval.run(    
          query=query
          , params={"Retriever": {"top_k": limit-1}}
      )  
    elif r["doc_type"]=="legal case":
      res = legal_doc_retrieval.run(    
          query=query
          , params={"Retriever": {"top_k": limit-1}}
      )  
    else:    
      res = misc_doc_retrieval.run(    
          query=query
          , params={"Retriever": {"top_k": limit-1}}
      )  


    retrieved_docs = [ d.content for d in res["documents"]]
    for i in range(1,limit):
      try:
        data["rank{}".format(i)].append(retrieved_docs[i-1])
      except:
        data["rank{}".format(i)].append(" ")
    data["context"].append(r["context"])
    data["answer"].append(r["Answer Span"])

103it [00:08, 11.67it/s]


In [None]:
pd.DataFrame(data).to_csv("preprocessed.csv",index=False)
pd.DataFrame(data)

Unnamed: 0,question,context,answer,rank1,rank2,rank3,rank4,rank5,rank6,rank7,rank8,rank9,rank10,rank11,rank12,rank13,rank14,rank15
0,Does innocent recipient UPSI defence PIT regulations?,The report (Para 55) suggests “where a person trades on the basis of content...,insider may prove his innocence by demonstrating the inclusive list of circu...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,Filed on : 26.3.2014 Registered on : 03.6.2014 Decided on : 15.12.2017 Durat...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...
1,Will promoter group entity require pre-clearance related company even though...,"With respect to the query at 5(i), attention may be drawn to clause 6 of Sch...","only by ""Designated persons"" if the value of the proposed trades is above su...","KIRLOSKAR CHILLERS PRIVATE LIMITED\tEnriching LivesA Ki4,-Loskar ro-wp- Covv...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,ADJUDICATION ORDER NO. Order/JS/DJ/2019-20/3484-3488 UNDER SECTION 15-I OF S...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA (ADJU...,ADJUDICATION ORDER NO. Order/JS/DJ/2019-20/3493-3494 UNDER SECTION 15-I OF S...,WTM/SM/IVD/ID2/9713/2020-21 BEFORE THE SECURITIES AND EXCHANGE BOARD OF INDI...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA (ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,` SECURITIES AND EXCHANGE BOARD OF INDIA WTM/MPB/IVD/ ID1/139/2020 ORDER Und...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA ADJUD...
2,Is inter-se off-market transfer shares insiders within period 6 months post ...,"In the instant case, the said promoters have the option to convert warrants ...",promoters have the option to convert warrants any time within 18 months from...,"1.NIMI UPENDRABHAI PATELKrishnarpan"" Samir Estate, Gotri-Sevasi Road, Sevasi...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,VITT7V redv&rcil:44.1Securities and Exchange Board of IndiaCHIEF GENERAL MAN...,` SECURITIES AND EXCHANGE BOARD OF INDIA WTM/MPB/IVD/ ID1/139/2020 ORDER Und...,STAR CEMENT LIMITED(Formerly CEMENT MANUFACTURING COMPANY LTD)ToCorporate Fi...,"KIRLOSKAR CHILLERS PRIVATE LIMITED\tEnriching LivesA Ki4,-Loskar ro-wp- Covv...","BEFORE THE SECURITIES AND EXCHANGE BOARD OF INDIA CORAM: MADHABI PURI BUCH, ...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OFINDIA ADJUDI...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OFINDIA ADJUDI...,"BEFORE THE SECURITIES AND EXCHANGE BOARD OF INDIA, MUMBAI CORAM: S. RAMAN, W...",WTM/MPB/EFD/ 116 /2018 BEFORE THE SECURITIES AND EXCHANGE BOARD OF INDIA COR...,"BEFORE THE SECURITIES AND EXCHANGE BOARD OF INDIA, MUMBAI CORAM: S. RAMAN, W..."
3,Can AIF invest unutilized funds liquid mutual funds?,The provisions under Regulation 15(1)(f) is provided in the interest of inve...,may invest investment income or investment proceeds arising from sale or tra...,g1'(alzre1 Tfa-34frrcirviHqSecurities and Exchange Board of IndiaDEPUTY GENE...,29 June 2018The Chief General ManagerInvestment Management DepartmentDivisio...,41471Eeffvfffar1-4- rciPiiq O. aSecurities and Exchange Board of IndiaDEPUTY...,Securities and ExchangeBoard of IndiaDEPUTY GENERAL MANAGERINVESTMENT MANAGE...,CONSULTATIVE PAPER ON GUIDELINES ON OVERSEAS INVESTMENTS AND OTHERISSUES/CLA...,AJM FINANCIAL27 November 2018The Chief General ManagerInvestment Management ...,CONCEPT PAPER ON PROPOSED ALTERNATIVE INVESTMENT FUNDS REGULATION FORPUBLIC ...,Consultation Paper on Crowdfunding in India1.0 Introduction1.1 This consulta...,Report submitted by Alternative Investment Policy Advisory Committee1. To so...,"CHIEF GENERAL MANAGER\t9774-irredyftINTEGRATED SURVEILLANCE DEPARTMENT\ta?)""...",.4\\tSBI Funds Management Private Limited A joint venture between SBI & AMUN...,".v.`""71-TATAcov)26 May 2016P/2016052710000042025The Chief General Manager\tA...","LettsVentureNovember 4th, 2019To,Investment Management Department,Division o...","KellyGammaAdvisors LLP1007/1008, le Floor, Shapath V, Opp. Karnavati Club, N...",is T1 CF;Kri --UtuAt)Motilal Oswal Real Estate Investment Advisors Pvt. Ltd....
4,penal consequences furnishing information asked SEBI?,The Honorable Securities Appellate Tribunal (hereinafter referred to as “SAT...,attract the penalty prescribed under section 15A of the SEBI Act,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA _____...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,SECURITIES AND EXCHANGE BOARD OF INDIA [ADJUDICATION ORDER NO. MC/AO- 01/201...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OFINDIA ADJUDI...,"3. (1) With effect from such date as the Board may, by an order fix, there s...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,3. (1) An application by a debenture trustee for grant of a cert...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,WTM/SM/IVD/ID2/9711/2020-21 BEFORE THE SECURITIES AND EXCHANGE BOARD OF INDI...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,3. (1) An application by scheduled bank for grant of a certificat...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,InvIT?,InvITs are proposed to be vehicles allowing for adding of projects in future...,vehicles allowing for adding of projects in future in\nthe same vehicle so t...,"Draft SEBI (Infrastructure Investment Trusts) Regulations, 2014SEBI had come...",3. (1) No person shall act as an InvIT unless it has obtained a ...,Consultation paper for amendments to the SEBI (Infrastructure InvestmentTrus...,Consultation paper for guidelines for public issue of units of Infrastructur...,Consultation paper for Disclosure of financial information in offerdocument/...,"LINDIA INFRASTRUCTURE TRUST(Principal place of Business: Unit no.804,A-Wing,...",Consultation PaperContinuous disclosures to be made by Infrastructure Invest...,rcil444t4 albSecurities and Exchange Board of IndiaDEPUTY GENERAL MANAGERDEP...,Consultation paper for amendments to the SEBI (Infrastructure Investment Tru...,IITT*TiTirTiftalfr rcirviti W-13cSecurities and Exchange Board of IndiaYogit...,,,,,
99,"Section 77A Companies Act, 1956 entail?","Section 77A of the Companies Act, 1956 contains the basic framework for\ncom...",contains the basic framework for\ncompanies to buy back its own securities,9117rei/ gffry7W3* W'FirerSecurities and ExchangeBoard of IndiaCFD/DCR/TO/DA...,"SPLStwo..'--ketc .77TUPREME PE ROCHEM LIDSolitaire Corporate Park, Buliding ...",SECURITIES AND EXCHANGE BOARD OF INDIA WTM/SM/IVD/ID4/6335/2019-20 ORDER UND...,3. (1) These regulations shall be applicable to buy-back of shares or other ...,"Navigate your nextDate: January 29, 2019Securities and Exchange Board of Ind...","KPIT CumminsInfosysterns LimitedSeptember 7, 2011To,Mr. Sumit AgrawalAssista...",Deputy General Manager\t9lirdhi lairaIntegrated Surveillance Department\tWpm...,"g10 P-\[S IsALOKALOK INDUSTRIES LIMITEDPeninsula Towers, Peninsula Corporate...","3. (1) No insider shall communicate, provide, or allow access to any unpubli...",ADJUDICATION ORDER NO. JS/DJ/12-16/2017 UNDER SECTION 15-I OF SECURITIES AND...,TIDE WATER OIL CO. (INDIA) LTD.Regd. Office: Yule House I 8 Dr. Rajendra Pra...,Public comments on Report of the Takeover Regulations Advisory Committee may...,44.TrraThEr3T1T rcrldSecurities and ExchangeBoard of IndiaCHIEF GENERAL MANA...,47TraWeal&altr\tcaSecurities and Exchange Board of IndiaCFD/PC/IG/CB/SSVS/15...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...
100,"Section 77A(4) Companies Act, 1956 say?","Section 77A(4) of the Companies Act, 1956 specifies that every buy back shal...",every buy back shall\nbe completed within a period of 12 months.,9117rei/ gffry7W3* W'FirerSecurities and ExchangeBoard of IndiaCFD/DCR/TO/DA...,"SPLStwo..'--ketc .77TUPREME PE ROCHEM LIDSolitaire Corporate Park, Buliding ...",SECURITIES AND EXCHANGE BOARD OF INDIA WTM/SM/IVD/ID4/6335/2019-20 ORDER UND...,3. (1) These regulations shall be applicable to buy-back of shares or other ...,"Navigate your nextDate: January 29, 2019Securities and Exchange Board of Ind...","KPIT CumminsInfosysterns LimitedSeptember 7, 2011To,Mr. Sumit AgrawalAssista...",Deputy General Manager\t9lirdhi lairaIntegrated Surveillance Department\tWpm...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA (ADJU...,"raiav Crind-eteckz)R SYSTEMS INTERNATIONAL LIMITEDSEI CMMI Level 5, PCMM Lev...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,Public comments on Report of the Takeover Regulations Advisory Committee may...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ ADJ...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...
101,"Section 77A(2) Companies Act, 1956 say?","Section 77A(2) of the Companies Act,1956 prohibits only back to back\nbuy ba...",prohibits only back to back\nbuy backs through board resolution,9117rei/ gffry7W3* W'FirerSecurities and ExchangeBoard of IndiaCFD/DCR/TO/DA...,"SPLStwo..'--ketc .77TUPREME PE ROCHEM LIDSolitaire Corporate Park, Buliding ...",SECURITIES AND EXCHANGE BOARD OF INDIA WTM/SM/IVD/ID4/6335/2019-20 ORDER UND...,3. (1) These regulations shall be applicable to buy-back of shares or other ...,"Navigate your nextDate: January 29, 2019Securities and Exchange Board of Ind...","KPIT CumminsInfosysterns LimitedSeptember 7, 2011To,Mr. Sumit AgrawalAssista...",Deputy General Manager\t9lirdhi lairaIntegrated Surveillance Department\tWpm...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA (ADJU...,"raiav Crind-eteckz)R SYSTEMS INTERNATIONAL LIMITEDSEI CMMI Level 5, PCMM Lev...",BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,Public comments on Report of the Takeover Regulations Advisory Committee may...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ADJU...,BEFORE THE ADJUDICATING OFFICER SECURITIES AND EXCHANGE BOARD OF INDIA [ ADJ...


In [None]:
analysis=dict()
analysis["positive"]=list()
analysis["answer"]=list()
analysis["question"]=list()

for i in range(1,limit):
  analysis["rank{}".format(i)]=list()

In [None]:
found=0
tempdf = pd.DataFrame(data)
for n,r in tempdf.iterrows():
  for i in range(1,limit):
    if r["context"] in r["rank{}".format(i)] or r["rank{}".format(i)] in r["context"]:
      found+=1

found

16

In [None]:
tempdf = pd.DataFrame(data)
for n,r in tempdf.iterrows():
  for i in range(1,limit):
    val=0
    analysis["rank{}".format(i)].append(r["rank{}".format(i)])
    if r["answer"] in r["rank{}".format(i)]:
      val=1
  analysis["positive"].append(val)
  analysis["answer"].append(r["answer"])
  analysis["question"].append(r["question"])


In [None]:
found=0
tempdf = pd.read_csv("nonpreprocessed.csv")
for n,r in tempdf.iterrows():
  for i in range(1,limit):
    if r["context"] in r["rank{}".format(i)] or r["rank{}".format(i)] in r["context"]:
      print("YES: ",n )
      print(i)
      found+=1
      break
  print("="*5)

found

In [None]:


tempdf = pd.read_csv("nonpreprocessed.csv")
# tempdf = pd.DataFrame(data)
for n,r in tempdf.iterrows():
  for i in range(1,limit):
    val=0
    analysis["rank{}".format(i)].append(r["rank{}".format(i)])
    if r["answer"] in r["rank{}".format(i)]:
      val=1
  analysis["positive"].append(val)
  analysis["answer"].append(r["answer"])
  analysis["question"].append(r["question"])


NameError: ignored

In [None]:
analysisdf = pd.DataFrame(analysis)


In [None]:
analysisdf[analysisdf["positive"]==1].head(n=10).to_csv("PositiveRetrievals.csv",index=False)

In [None]:
analysisdf[analysisdf["positive"]==0].head(n=10).to_csv("NegativeRetrievals.csv",index=False)

### FULL PIPELINE

In [None]:
# query
metric.compute(predictions=formatted_predictions, references=references)
# Without query preprocessing : {'exact_match': 0.0, 'f1': 12.927294976475306}
# With query preprocessing : {'exact_match': 0.0, 'f1': 1.0526315789473686}

In [None]:
len(references)

In [None]:
legaldf= df[df["doc_type"]=="legal case"]
formatted_predictions=list()
answeregs=list()
references=list()
itr=0
for i,r in tqdm(legaldf.iterrows()):
    prediction = legal_pipe.run(    
        query=r["question"]
        , params={"Retriever": {"top_k": 15}, "Reader": {"top_k": 1}}
    )  
    references.append({"id": i, "answers": r["answers"]})
    formatted_predictions.append({"id": i, "prediction_text":  prediction["answers"][0].answer})


In [None]:
metric.compute(predictions=formatted_predictions, references=references)

In [None]:
legaldf= df[df["doc_type"]=="misc"]
formatted_predictions=list()
answeregs=list()
references=list()
itr=0
for i,r in tqdm(legaldf.iterrows()):
    prediction = legal_pipe.run(    
        query=r["question"]
        , params={"Retriever": {"top_k": 15}, "Reader": {"top_k": 1}}
    )  
    references.append({"id": i, "answers": r["answers"]})
    formatted_predictions.append({"id": i, "prediction_text":  prediction["answers"][0].answer})


In [None]:
metric.compute(predictions=formatted_predictions, references=references)