## Query Processing Module

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/colabdata/pipeline.zip

Archive:  /content/drive/MyDrive/colabdata/pipeline.zip
  inflating: QnA_Latest.csv          
   creating: data/
  inflating: data/case_tf_idf_new.pkl  
  inflating: data/concept_filenames.pkl  
  inflating: data/glossary.json      
  inflating: data/misc_total_vocab_new.pkl  
  inflating: data/miscfiles_mapper.pkl  
  inflating: data/new_cleanedregulations48.pkl  
  inflating: data/mainvocab.pkl      
  inflating: data/case_origreg.pkl   
  inflating: data/case_filenames.pkl  
  inflating: data/case_DF.json       
  inflating: data/misc_DF.json       
  inflating: data/trainquery.pkl     
  inflating: data/tfidf_regs.pkl     
  inflating: data/case_queries.pkl   
  inflating: data/misc_DF_new.json   
   creating: data/Definitions/
  inflating: data/Definitions/SETTLEMENT-PROCEEDINGS.json  
  inflating: data/Definitions/OMBUDSMAN.json  
  inflating: data/Definitions/INTERMEDIARIES.json  
  inflating: data/Definitions/Investor-Protection-and-Education-Fund.json  
  inflating: data/Defin

#### Query Input

In [None]:
!ls

data  drive  QnA_Latest.csv  sample_data


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import bs4

%matplotlib inline

DATA="data/"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
print(tf.__version__)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

2.7.0


In [None]:
from nltk.metrics import edit_distance
#Definitions dictionary
import os, json
import pickle
# Merge every JSON file under Definitions/ into a single lookup dictionary
# (presumably term -> definition text; the JSON schema is not visible here).
defdict = {}
path_to_json = DATA + 'Definitions/'
# os.listdir returns entries in arbitrary order; sorting keeps the merge order
# (and therefore which file wins when two files share a key) reproducible.
json_files = sorted(pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json'))
for i in json_files:
    with open(path_to_json + i) as json_file:
        data = json.load(json_file)
    defdict.update(data)
# Parallel lists: definitions[n] is the key whose value is defvalues[n].
definitions = list(defdict.keys())
defvalues = list(defdict.values())

docs = ["Issue and Listing of Non Convertible Redeemable Preference Shares", "Investment Advisers", "Depositories and Participants", "Mutual Funds", "Employees Service", "Substantial Acquisition of Shares and Takeovers", "Appointment of Administrator and Procedure for Refunding to the Investors", "Prohibition of Fraudulent and Unfair Trade Practices relating to Securities Market", "Know Your Client Regulations", "Prohibition of Insider Trading", "Merchant Bankers", "Issue and Listing  of Securities Debt Instruments and Security Receipts", "Delisting of Equity Shares","Issue of Capital And Disclosure Requirements2", "Foreign Venture Capital Investor", "Procedure for Board Meetings", "Custodian", "Ombudsman", "Investor Protection and Education Fund", "Foreign Portfolio Investors", "Issue of Sweat Equity", "Collective Investment Scheme", "Portfolio Managers", "Research Analysts", "Procedure for Search and Seizure", "Issue of Capital And Disclosure Requirements", "Share Based Employee Benefits", "Debenture Trustees", "Alternative Investment Funds", "Stock Exchanges and Clearing Corporations", "Self Regulatory Organisations", "Settlement Proceedings", "Issues and Listing of Muncipal Debt Securities", "Buy Back Of Securities2","Issue and Listing of Debt Securities", "Infrastructure Investment Trusts", "Stock Brokers", "Listing Obligations and Disclosure Requirements", "Registrars to an Issue and Share Transfer Agents", "Real Estate Investment Trusts", "Intermediaries", "Certification of Associated Persons in the Securities Markets", "Credit Rating Agencies", "Regulatory Fee on Stock Exchanges", "Underwriters", "Buy Back Of Securities", "Bankers to an Issue", "Central Database of Market Participants"]

# Regulations: one entry per document, looked up positionally against `docs`
# (tfidfreg uses docs.index(...) to pick entries).
# NOTE(review): the (truncated) archive listing above shows
# data/new_cleanedregulations48.pkl rather than this file name — confirm the
# intended file is actually present.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# archives from a trusted source.
with open(DATA + 'cleanedregulations48.pkl','rb') as f:
    docregs = pickle.load(f)    

# Topics per document; querypreprocess tests query words against each entry
# with `word in topic`.
with open(DATA + 'cleanedregtopics48.pkl','rb') as f:
    finaltopics = pickle.load(f)    

# Vocabulary resources. Both are passed to regextract by QnAmodel, but
# regextract does not read them.
with open(DATA + 'mainvocab.pkl','rb') as f:
    mainvocab = pickle.load(f) 

with open(DATA + 'vocabdef.pkl','rb') as f:
    vocabdef = pickle.load(f) 

# Additional (misc) documents: file names.
# NOTE(review): path is hard-coded as 'data/' instead of using DATA, and
# QnAmodel re-reads this same file on every 'misc' query; `miscf` is not
# referenced again in the visible cells.

with open('data/misc_filenames_testing.pkl','rb') as filer:
    miscf = pickle.load(filer)

    
# Legal case files: file names.
# NOTE(review): same remarks as for `miscf` — hard-coded path, re-read inside
# QnAmodel, `legalf` not referenced again in the visible cells.
with open('data/legal_filenames_testing.pkl','rb') as filer:
    legalf = pickle.load(filer)

import spacy
nlp = spacy.load('en_core_web_sm')

from nltk.corpus import wordnet

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import math
import numpy as np
from collections import Counter
from nltk.corpus import stopwords

# Stop-word set used by remove_stop_words: NLTK's English list plus
# domain-specific boilerplate terms.
# NOTE(review): preprocessing() lower-cases text before calling
# remove_stop_words, and the membership test is case-sensitive, so the
# capitalised entries ('Page', 'Securities', 'Exchange', 'India', '-PRON-')
# cannot match on that path — confirm whether lower-case forms were intended.
STOPWORDS = set(
    stopwords.words('english') +\
    ['mm', 'section', 'subsection', 'schedule', '-PRON-', 'chapter', 'regulation', 'repealed', 'thereto','unpublishe', 'thereunder','guideline', 'reference','onus','make','Page','Securities','Exchange','India'])

with open(DATA + 'glossary.json') as f:
    glossary = json.load(f)

In [None]:
def queryvocab(query):
    """Strip stop words, boilerplate legal terms and question words from a query.

    Matching is case-sensitive and operates on whitespace-separated tokens, so
    a token with attached punctuation (e.g. ``"SEBI?"``) is kept as-is.

    Args:
        query: Raw question text.

    Returns:
        A ``(tokens, text)`` tuple: the surviving tokens in their original
        order, and the same tokens joined with single spaces.
    """
    question_words = ["What","When","Where","Why","How","Who"]
    REMOVE_WORDS = ['regulations','rules','rule','chapter','section','sub','SEBI','means','shall','Securities','Exchange',
                    'pertaining','India']
    # Build the exclusion set once. Evaluating stopwords.words() (no argument:
    # every language NLTK ships) plus the list concatenation inside the
    # comprehension repeated that work for every single token.
    excluded = set(stopwords.words() + REMOVE_WORDS + question_words)
    qvocab = [token for token in query.split() if token not in excluded]

    return qvocab, " ".join(qvocab)

In [None]:
def querypreprocess(query, qvocab, definitions, finaltopics):
    """Expand a tokenised query with definition text and collect its key terms.

    Reads the module-level ``defvalues`` list (indexed in step with
    ``definitions``) and the module-level spaCy pipeline ``nlp``.

    Args:
        query: Not read; the name is rebound to the expanded query that is
            built from ``qvocab``.
        qvocab: Query tokens; returned unchanged.
        definitions: List of defined terms (``.index`` is called on it).
        finaltopics: Iterable of topic containers tested with ``word in topic``.

    Returns:
        ``(query, qvocab, importantwords, expansionwords)`` where ``query`` is
        the space-joined ``qvocab`` followed by every expansion word
        concatenated directly with its ``defvalues`` entry.
    """
    importantwords = [word for word in qvocab if word in definitions]
    expansionwords = []

    for word in qvocab:
        for topic in finaltopics:
            if word not in topic:
                continue
            if word not in importantwords:
                importantwords.append(word)
            elif word in definitions:
                # NOTE(review): a defined word is appended once per topic that
                # contains it, so its definition can be repeated in the
                # expanded query — confirm this weighting is intended.
                expansionwords.append(word)

    # Every expansion word is a member of `definitions`, so each one is
    # followed by its definition text.
    # NOTE(review): the word and its definition are joined without a
    # separator — confirm the stored values start with whitespace.
    expanded = " ".join(qvocab)
    for word in expansionwords:
        expanded = expanded + ' ' + word + defvalues[definitions.index(word)]
    query = expanded

    sent = nlp(query)
    for position, token in enumerate(sent):
        text = str(token)
        if text not in importantwords:
            continue
        # The token immediately before an important word becomes important too.
        if position != 0:
            previous = str(sent[position - 1])
            if previous not in importantwords:
                importantwords.append(previous)
        # Tokens tagged 'VB' are appended again, even though already present.
        if token.tag_ == 'VB':
            importantwords.append(text)

    return query, qvocab, importantwords, expansionwords

In [None]:
def regextract(docinput, docs, docregs, mainvocab, vocabdef, query, glossary):
    """Resolve which regulation documents a query should be searched in.

    Args:
        docinput: A document name, or ``'All'`` to infer documents from the
            query via the glossary.
        docs: Sequence of document names, indexed by the values stored in
            ``glossary``.
        docregs: Unused; kept for call compatibility.
        mainvocab: Unused; kept for call compatibility.
        vocabdef: Unused; kept for call compatibility.
        query: Raw query text. Glossary keys are matched as substrings of the
            lower-cased query, so keys containing upper-case letters never
            match.
        glossary: Mapping of phrase -> iterable of indices into ``docs``.

    Returns:
        A list of document names: ``[docinput]`` for an explicit document,
        the glossary-matched documents for ``'All'``, or ``['All']`` when no
        glossary phrase occurs in the query.
    """
    if docinput != 'All':
        return [docinput]

    lowered = query.lower()  # computed once rather than per glossary key
    rdocs = set()
    for phrase, doc_ids in glossary.items():
        if phrase in lowered:
            rdocs.update(doc_ids)

    if rdocs:
        return [docs[i] for i in rdocs]
    return ['All']

## Regulations Retrieval Module

In [None]:
def convert_lower_case(data):
    """Lower-case ``data`` element-wise with ``np.char.lower`` and return the array."""
    lowered = np.char.lower(data)
    return lowered

def remove_stop_words(data):
    """Tokenise ``str(data)`` and drop stop words and one-character tokens.

    Uses the module-level ``STOPWORDS`` set; matching is case-sensitive.
    Returns the surviving tokens joined with single spaces.
    """
    kept = []
    for token in word_tokenize(str(data)):
        if len(token) > 1 and token not in STOPWORDS:
            kept.append(token)
    return " ".join(kept)

def remove_punctuation(data):
    """Replace punctuation characters with spaces and delete commas.

    Each symbol is replaced by a single space, and after every replacement
    one pass of double-space collapsing is applied. Commas are removed
    outright rather than replaced. Apostrophes are not touched here.

    Args:
        data: A string or NumPy string array.

    Returns:
        A NumPy string array (0-d for a plain string input).
    """
    # The backslash is written as "\\" so the literal contains no invalid
    # "\]" escape sequence; the resulting characters are unchanged.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    """Delete every apostrophe from ``data`` via ``np.char.replace``."""
    without_apostrophes = np.char.replace(data, "'", "")
    return without_apostrophes

def stemming(data):
    """Porter-stem every token of ``str(data)`` and return them space-joined."""
    porter = PorterStemmer()
    stems = []
    for token in word_tokenize(str(data)):
        stems.append(porter.stem(token))
    return " ".join(stems)

In [None]:
def preprocessing(data):
    """Normalise text by running it through the cleaning helpers in order.

    The sequence is: lower-case, strip punctuation, strip apostrophes, drop
    stop words, stem, then strip punctuation and stem a second time.
    Returns the string produced by the final stemming pass.
    """
    steps = (
        convert_lower_case,
        remove_punctuation,
        remove_apostrophe,
        remove_stop_words,
        stemming,
        remove_punctuation,
        stemming,
    )
    for step in steps:
        data = step(data)
    return data

In [None]:
def tfidfreg(docregs, docinput, category):
    """Collect candidate regulations and map each one to its source document.

    Despite the name, no tf-idf weights are computed here. Reads the
    module-level ``docs`` list of document names.

    Args:
        docregs: Per-document collections of regulation texts, positionally
            aligned with ``docs``.
        docinput: Iterable of requested document names.
        category: When ``'regulations'``, ``docregs`` is narrowed to the
            requested documents that exist in ``docs`` (if there are any).

    Returns:
        ``(origreg, reverseMap)``: the regulations whose preprocessed form is
        non-empty, and a dict mapping every visited regulation text to the
        name of the document it came from.
    """
    # Names are tracked alongside docregs. Indexing `docs` by position after
    # filtering attributed regulations to whichever documents happen to sit
    # at the start of `docs`, not to the documents actually selected.
    docnames = docs
    if category == 'regulations':
        selected = [name for name in docinput if name in docs]
        if selected:
            docregs = [docregs[docs.index(name)] for name in selected]
            docnames = selected

    origreg = []
    reverseMap = dict()
    for ind, regulations in enumerate(docregs):
        for regulation in regulations:
            reverseMap[regulation] = docnames[ind]
            # Keep only regulations that still have content once cleaned.
            if preprocessing(regulation) not in ('', '\xa0\n'):
                origreg.append(regulation)

    return origreg, reverseMap


In [None]:
def cosine_sim(a, b):
    """Cosine similarity of two vectors.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm. Without that guard the result is NaN, which a descending
        ``argsort()[::-1]`` ranking places first.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
!pip3 install sentence_transformers transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.7 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 18.7 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 18.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 39.5 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 511 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.many

In [None]:
# from sentence_transformers import SentenceTransformer
import tensorflow as tf
import tensorflow_hub as hub

def cosine_similarity_transformer(k, query, qvocab, origreg, importantwords,reverseMap):
    """Rank passages by Universal Sentence Encoder similarity to the query.

    Args:
        k: Maximum number of results to return.
        query: Query text to embed.
        qvocab: Unused; kept for call compatibility.
        origreg: List of candidate passages (sliced and passed to the encoder).
        importantwords: Unused; kept for call compatibility.
        reverseMap: Dict mapping a passage to its source document; passages
            missing from it yield ``None``.

    Returns:
        ``(answers, indexs, distances)``: the top ``k`` passages in order of
        descending cosine similarity, their ``reverseMap`` entries, and their
        similarity scores. All three are empty when ``origreg`` is empty.
    """
    if len(origreg) == 0:
        return [], [], []

    # NOTE(review): the encoder is re-loaded on every call; consider loading
    # it once at notebook level if queries are issued repeatedly.
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    model = hub.load(module_url)

    query_vector = model([query]).numpy()[0]

    # Embed the corpus in at most three batches (presumably to bound memory
    # use — TODO confirm). Ceiling division means no batch is ever empty,
    # even when there are fewer than three passages.
    seg = (len(origreg) + 2) // 3
    batches = [origreg[start:start + seg] for start in range(0, len(origreg), seg)]
    reg_vectors = tf.concat(axis=0, values=[model(batch) for batch in batches])

    d_cosines = [cosine_sim(query_vector, r) for r in reg_vectors.numpy()]

    # Keep passage, source and score aligned through a single ranking order.
    # Re-sorting the passage strings afterwards would order them
    # alphabetically and discard the similarity ranking.
    order = np.array(d_cosines).argsort()[::-1]
    answers = [origreg[i] for i in order]
    distances = [d_cosines[i] for i in order]
    indexs = [reverseMap.get(passage) for passage in answers]

    return answers[:k], indexs[:k], distances[:k]




In [None]:
def vectormodel(query, k, qvocab, origreg,importantwords,reverseMap):
    """Retrieve the top-``k`` passages for ``query``.

    Thin wrapper that forwards every argument to
    ``cosine_similarity_transformer`` and returns its three results
    (passages, source documents, scores) unchanged.
    """
    passages, sources, scores = cosine_similarity_transformer(
        k=k,
        query=query,
        qvocab=qvocab,
        origreg=origreg,
        importantwords=importantwords,
        reverseMap=reverseMap,
    )
    return passages, sources, scores

In [None]:
from transformers import pipeline
import json

def answermod(A,queryinput, model_name):
    """Run extractive question answering over candidate passages.

    Args:
        A: Candidate context passages.
        queryinput: The question text.
        model_name: Identifier passed to ``pipeline`` as both model and
            tokenizer.

    Returns:
        ``(answer, context, score)`` for the highest-scoring passage, where
        ``context`` is the passage the answer was extracted from.

    Raises:
        IndexError: If no passage could be scored (``A`` is empty or every
            passage failed in the pipeline).
    """
    model = pipeline(model=model_name, tokenizer=model_name, task="question-answering")

    # Answer, score and context are stored together. Separate lists drift out
    # of step with the passage list as soon as one passage is skipped, which
    # makes the returned context belong to a different passage than the answer.
    scored = []
    for r in A:
        try:
            result = model(question=queryinput, context=r)
            candidate = (result["score"], result["answer"], r)
        except Exception:
            # Best effort: a passage the pipeline cannot process is skipped.
            continue
        scored.append(candidate)

    if not scored:
        raise IndexError("no candidate passage produced an answer")

    score, answer, context = max(scored, key=lambda item: item[0])
    return answer, context, score # Returns the high score answer,context and the score

In [None]:
def _load_passage_corpus(origreg_path, filenames_path):
    """Load pickled passages and their parallel file names as ``(origreg, reverseMap)``."""
    with open(filenames_path, 'rb') as filer:
        lcfiles = pickle.load(filer)

    with open(origreg_path, 'rb') as filer:
        origreg = pickle.load(filer)

    assert len(origreg) == len(lcfiles)
    # Later duplicates of a passage overwrite earlier ones.
    return origreg, dict(zip(origreg, lcfiles))


def QnAmodel(queryinput, docinput,k,category, model_name,print_output = False):
    """Answer a question from one of the three corpora.

    Args:
        queryinput: The question text.
        docinput: Document name or ``'All'``; only used for ``'regulations'``.
        k: Number of retrieved passages handed to the QA model.
        category: ``'regulations'``, ``'misc'`` or ``'legal case'``.
        model_name: Question-answering model identifier for ``answermod``.
        print_output: When true, print progress messages.

    Returns:
        ``(answer, ansreg)`` — the extracted answer span and the passage it
        came from — or ``None`` for an unrecognised ``category`` (callers that
        unpack the result must check for this).
    """
    if print_output:
        print('Preprocessing query......')
    qvocab,query = queryvocab(queryinput)

    if category == 'regulations':
        docinput = regextract(docinput, docs, docregs, mainvocab, vocabdef, queryinput, glossary)
        query, qvocab, importantwords, expansionwords = querypreprocess(query, qvocab, definitions, finaltopics)

        if print_output:
            print('tf-idf calculation in progress.....')
        origreg,reverseMap = tfidfreg(docregs,docinput,category)

    elif category == 'misc':
        # NOTE(review): these paths are hard-coded rather than built from DATA.
        origreg, reverseMap = _load_passage_corpus(
            'data/misc_origreg_testing.pkl', 'data/misc_filenames_testing.pkl')
        query, qvocab, importantwords, expansionwords = querypreprocess(queryinput, qvocab, definitions, finaltopics)

    elif category == 'legal case':
        # The case-sentences pickle is no longer loaded: nothing used it.
        origreg, reverseMap = _load_passage_corpus(
            'data/legal_origreg_testing.pkl', 'data/legal_filenames_testing.pkl')
        query, qvocab, importantwords, expansionwords = querypreprocess(queryinput, qvocab, definitions, finaltopics)

    else:
        return None

    if print_output:
        print('Extracting relevant answer regulations.......')
    A,indexs,distances = vectormodel(query=query, k=k, qvocab=qvocab, origreg=origreg,importantwords=importantwords,reverseMap=reverseMap)

    if print_output:
        print('Extracting answer')
    answer, ansreg, _ = answermod(A, queryinput,model_name)

    return answer, ansreg


In [None]:

import time

# queryinput = "Who is an acquirer under SAST?"
# docinput = 'All'
# category = 'legal case'

# queryinput = "How much amount will be provided as reward for informants in insider trading cases"
# docinput = 'All'
# category = 'regulations'

queryinput = "Can an investment manager manage multiple InvIT?"
docinput = 'All'
category = 'misc'

model_name = "deepset/roberta-base-squad2"

k=15
begin = time.time()
answer,ansreg = QnAmodel(queryinput,docinput,k, category,model_name)

end=time.time()

INFO:absl:Using /tmp/tfhub_modules to cache modules.
  return array(a, dtype, copy=False, order=order)


In [None]:
(end-begin)/60

1.0429712375005087

In [None]:
print(queryinput)
print()
print(answer)
print()
print(ansreg)

Can an investment manager manage multiple InvIT?

GIL cannot fix tenure of OCDs beyond 18 months

your letter under reference and without necessarily agreeing with your analysis, our views on the queries raised by you are as under:5. Response to Query no. (i):i. Regulation 70(1) of Chapter vii on Preferential Issue, of SEBI (ICDR) Regulations,2009 clearly states that "the provisions of this Chapter shall not apply where the preferential issue of equity shares is made:(a) Pursuant to conversion of loan or option attached to convertible debt instruments in terms of sub-section (3) and (4) of section 81 of Companies Act, 1956 or sub-section (3) and (4) of section 62 of the Companies Act, 2013, whichever applicable;(b)(c) ...."ii. Regulation 70(1) clearly specifies that it shall not apply where preferential Issueof equity shares is made pursuant to option attached to convertible debtinstruments sub-section (3) and (4) of section 62 of the Companies Act, 2013.JContinuationwral-ErNAT rcirril

In [None]:
queryinput = "Who is an acquirer under SAST?"
docinput = 'All'
category = 'legal case'

# queryinput = "Can an investment manager manage multiple InvIT?"
# docinput = 'All'
# category = 'misc'

model_name = "deepset/roberta-base-squad2"

k=15
begin = time.time()
answer,ansreg = QnAmodel(queryinput,docinput,k, category,model_name,print_output=True)

end=time.time()
(end-begin)/60

Preprocessing query......
Extracting relevant answer regulations.......
Extracting answer


  return array(a, dtype, copy=False, order=order)


1.3134735822677612

In [None]:
print(queryinput)
print()
print(answer)
print()
print(ansreg)

Who is an acquirer under SAST?

Omesh Sethi

 Being a Non-Executive Independent Director of RLL, noticee’s husband would have been aware of only such matters (including those related to subsidiaries of RLL) that were brought before the Board of RLL (including any Committee(s) of the Board of RLL of which noticee was a member) or discussed there. The matter of purchase of OCPL shares by Solrex was informed to the Board of RLL for the first time at its meeting held on April 22, 2008. Hence, in the absence of any evidence to the contrary, it cannot be alleged that the noticee’s husband was aware of the decision of Solrex to buy shares of OCPL prior to April 22, 2008.  It also appears to be SEBI’s case that the funding of Solrex for the purchase of OCPL shares was done by RLL pursuant to the meeting of the Board of Directors of RLL on March 28, 2008. However, in reality, the discussions at the RLL Board meeting on March 28, 2008 in this regard were limited to authorizing Mr. Malvinder Si

In [None]:
with open(DATA + 'misc_origreg_new.pkl', 'rb') as f:
    origreg = pickle.load(f)

In [None]:
with open(DATA + 'concept_text.pkl','rb') as f:
    origreg = pickle.load(f) 


In [None]:
# NOTE(review): this scratch cell raises NameError (see the recorded output).
# In the visible notebook, `embed` and `reg_vectors` are only assigned as
# locals inside cosine_similarity_transformer and never exist at notebook
# level, so the cell cannot run on a fresh kernel.
query_vector = embed([queryinput])
# `t` only counts loop iterations; it is not read afterwards in this cell.
t=0
d_cosines=list()
for r in reg_vectors.numpy():
    d_cosines.append(cosine_sim(query_vector.numpy()[0], r))
    t=t+1
    
# Indices of the vectors ordered from most to least similar.
out = np.array(d_cosines).argsort()[::-1]


NameError: ignored

In [None]:
reg_vectors.numpy()[0]

In [None]:
def avg_length(collect):
    """Mean number of whitespace-separated words per string in ``collect``.

    Args:
        collect: Sized collection of strings.

    Returns:
        The average word count, or ``0.0`` for an empty collection (instead
        of raising ZeroDivisionError).
    """
    if len(collect) == 0:
        return 0.0
    return sum(len(text.split()) for text in collect) / len(collect)

In [None]:
type(origreg[0])

In [None]:
# print("The average length of for misc rules is")
# print(avg_length(rules))
print("The average length of for misc origreg is")
print(avg_length(origreg))

In [None]:
# rules = ltext
with open(DATA + 'case_origreg_new.pkl', 'rb') as f:
    origreg = pickle.load(f)
# len(rules),len(ltext)

In [None]:
# print("The average length of for casefiles rules is")
# print(avg_length(rules))
print("The average length of for casefiles origreg is")
print(avg_length(origreg))

In [None]:
len(lfile)

### Data loading

In [None]:
df = pd.read_csv("QnA_Latest.csv")
df.index.name="id"
df

Unnamed: 0_level_0,question,Answer Span,context,Document,Keywords,Start index,Annotator
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Does an innocent recipient of UPSI have any de...,insider may prove his innocence by demonstrati...,The report (Para 55) suggests “where a person ...,1585217059979,"innocent recipient, UPSI, defence",1054.0,
1,Will a promoter group entity require a pre-cle...,"only by ""Designated persons"" if the value of t...","With respect to the query at 5(i), attention m...",0c9f4131313e54e49d7b9915993ab00e,"pre-clearance, compliance officer, designated ...",678.0,
2,Is inter-se off-market transfer of shares betw...,promoters have the option to convert warrants ...,"In the instant case, the said promoters have t...",0e8e621eb1aee80e102b4186ac874ed5,"inter-se off market transfer, block deal, cont...",0.0,
3,Can an AIF invest its unutilized funds in liqu...,may invest investment income or investment pro...,The provisions under Regulation 15(1)(f) is pr...,1e4e946eca55a80467aad2e1546dd639,"unutilized funds, Alternate Investment Funds, ...",257.0,
4,What are the penal consequences of not furnish...,attract the penalty prescribed under section 1...,The Honorable Securities Appellate Tribunal (h...,1289453383303,"Noticee, Summons, failure to Comply with summo...",1396.0,
...,...,...,...,...,...,...,...
103,What is InvIT?,vehicles allowing for adding of projects in fu...,InvITs are proposed to be vehicles allowing fo...,1387543144855,"regulations, guidelines, authority, markets",,Raj
104,What does the Section 77A of the Companies Act...,contains the basic framework for\ncompanies to...,"Section 77A of the Companies Act, 1956 contain...",1357124740967,"regulations, guidelines, authority",,Raj
105,What does the Section 77A(4) of the Companies ...,every buy back shall\nbe completed within a pe...,"Section 77A(4) of the Companies Act, 1956 spec...",1357124740967,"regulations, guidelines, authority, act",,Raj
106,What does the Section 77A(2) of the Companies ...,prohibits only back to back\nbuy backs through...,"Section 77A(2) of the Companies Act,1956 prohi...",1357124740967,"regulations, guidelines, authority, act",,Raj


In [None]:
def start_index(span,context):
    """Return the offset of the first occurrence of ``span`` in ``context``.

    The result is ``-1`` when ``span`` does not occur verbatim.
    """
    offset = context.find(span)
    return offset
      
df["Start index"] = df.apply(lambda x: start_index(x["Answer Span"],x["context"]),axis=1)
df[["Answer Span","context","Start index"]]

Unnamed: 0_level_0,Answer Span,context,Start index
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,insider may prove his innocence by demonstrati...,The report (Para 55) suggests “where a person ...,844
1,"only by ""Designated persons"" if the value of t...","With respect to the query at 5(i), attention m...",741
2,promoters have the option to convert warrants ...,"In the instant case, the said promoters have t...",30
3,may invest investment income or investment pro...,The provisions under Regulation 15(1)(f) is pr...,278
4,attract the penalty prescribed under section 1...,The Honorable Securities Appellate Tribunal (h...,248
...,...,...,...
103,vehicles allowing for adding of projects in fu...,InvITs are proposed to be vehicles allowing fo...,26
104,contains the basic framework for\ncompanies to...,"Section 77A of the Companies Act, 1956 contain...",39
105,every buy back shall\nbe completed within a pe...,"Section 77A(4) of the Companies Act, 1956 spec...",57
106,prohibits only back to back\nbuy backs through...,"Section 77A(2) of the Companies Act,1956 prohi...",41


In [None]:
df[df["Start index"]==-1]

Unnamed: 0_level_0,question,Answer Span,context,Document,Keywords,Start index,Annotator
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
80,What is the capital adequacy requirement for a...,not less than five crore reupees,The capital adequacy requirement referred to i...,Merchant Bankers,,-1,Vrinda
86,What is corporate governance?,the acceptance by management of the inalienabl...,Corporate governance is the acceptance by mana...,1292902977051,"economics, finance, markets",-1,Raj
87,What is Crowdfunding?,solicitation of funds (small amount) from mult...,Crowdfunding is solicitation of funds (small a...,1403005615257,"economics, finance, markets",-1,Raj
88,What is a green bond?,A green bond is like any other bond where a de...,A green bond is like any other bond where a de...,1449143298693,"economics, finance, markets",-1,Raj
95,How many stock exchanges in India are corporat...,18 recognised stock exchanges in India are cor...,"At present, 18 recognised stock exchanges in I...",1293515802514,"regulations, guidelines, authority, facts",-1,Raj


In [None]:
# Drop the rows whose answer span was not found verbatim in its context
# (start_index returned -1 for them).
df = df[df["Start index"]!=-1]
# reset_index without drop=True keeps the previous `id` index as a column.
# NOTE(review): `df` is rebound in place, so re-running this cell adds a
# further index column each time — run it once per fresh load.
df = df.reset_index()
df

Unnamed: 0,id,question,Answer Span,context,Document,Keywords,Start index,Annotator
0,0,Does an innocent recipient of UPSI have any de...,insider may prove his innocence by demonstrati...,The report (Para 55) suggests “where a person ...,1585217059979,"innocent recipient, UPSI, defence",844,
1,1,Will a promoter group entity require a pre-cle...,"only by ""Designated persons"" if the value of t...","With respect to the query at 5(i), attention m...",0c9f4131313e54e49d7b9915993ab00e,"pre-clearance, compliance officer, designated ...",741,
2,2,Is inter-se off-market transfer of shares betw...,promoters have the option to convert warrants ...,"In the instant case, the said promoters have t...",0e8e621eb1aee80e102b4186ac874ed5,"inter-se off market transfer, block deal, cont...",30,
3,3,Can an AIF invest its unutilized funds in liqu...,may invest investment income or investment pro...,The provisions under Regulation 15(1)(f) is pr...,1e4e946eca55a80467aad2e1546dd639,"unutilized funds, Alternate Investment Funds, ...",278,
4,4,What are the penal consequences of not furnish...,attract the penalty prescribed under section 1...,The Honorable Securities Appellate Tribunal (h...,1289453383303,"Noticee, Summons, failure to Comply with summo...",248,
...,...,...,...,...,...,...,...,...
98,103,What is InvIT?,vehicles allowing for adding of projects in fu...,InvITs are proposed to be vehicles allowing fo...,1387543144855,"regulations, guidelines, authority, markets",26,Raj
99,104,What does the Section 77A of the Companies Act...,contains the basic framework for\ncompanies to...,"Section 77A of the Companies Act, 1956 contain...",1357124740967,"regulations, guidelines, authority",39,Raj
100,105,What does the Section 77A(4) of the Companies ...,every buy back shall\nbe completed within a pe...,"Section 77A(4) of the Companies Act, 1956 spec...",1357124740967,"regulations, guidelines, authority, act",57,Raj
101,106,What does the Section 77A(2) of the Companies ...,prohibits only back to back\nbuy backs through...,"Section 77A(2) of the Companies Act,1956 prohi...",1357124740967,"regulations, guidelines, authority, act",41,Raj


In [None]:
def preprocess(t,s):
  """Wrap one answer text and its start index in the SQuAD ``answers`` layout.

  Args:
    t: answer text (the notebook passes the "Answer Span" column).
    s: start index of the answer (the notebook passes the "Start index" column).

  Returns:
    A dict with single-element lists under "answer_start" and "text".
  """
  answers = dict()
  answers["answer_start"] = [s]
  answers["text"] = [t]
  return answers

# Attach a SQuAD-style ``answers`` dict to every row, built from that row's
# "Answer Span" text and "Start index" value.
df["answers"] = df.apply(lambda x: preprocess(x["Answer Span"],x["Start index"]),axis=1)
df

Unnamed: 0,id,question,Answer Span,context,Document,Keywords,Start index,Annotator,answers
0,0,Does an innocent recipient of UPSI have any de...,insider may prove his innocence by demonstrati...,The report (Para 55) suggests “where a person ...,1585217059979,"innocent recipient, UPSI, defence",844,,"{'answer_start': [844], 'text': ['insider may ..."
1,1,Will a promoter group entity require a pre-cle...,"only by ""Designated persons"" if the value of t...","With respect to the query at 5(i), attention m...",0c9f4131313e54e49d7b9915993ab00e,"pre-clearance, compliance officer, designated ...",741,,"{'answer_start': [741], 'text': ['only by ""Des..."
2,2,Is inter-se off-market transfer of shares betw...,promoters have the option to convert warrants ...,"In the instant case, the said promoters have t...",0e8e621eb1aee80e102b4186ac874ed5,"inter-se off market transfer, block deal, cont...",30,,"{'answer_start': [30], 'text': ['promoters hav..."
3,3,Can an AIF invest its unutilized funds in liqu...,may invest investment income or investment pro...,The provisions under Regulation 15(1)(f) is pr...,1e4e946eca55a80467aad2e1546dd639,"unutilized funds, Alternate Investment Funds, ...",278,,"{'answer_start': [278], 'text': ['may invest i..."
4,4,What are the penal consequences of not furnish...,attract the penalty prescribed under section 1...,The Honorable Securities Appellate Tribunal (h...,1289453383303,"Noticee, Summons, failure to Comply with summo...",248,,"{'answer_start': [248], 'text': ['attract the ..."
...,...,...,...,...,...,...,...,...,...
98,103,What is InvIT?,vehicles allowing for adding of projects in fu...,InvITs are proposed to be vehicles allowing fo...,1387543144855,"regulations, guidelines, authority, markets",26,Raj,"{'answer_start': [26], 'text': ['vehicles allo..."
99,104,What does the Section 77A of the Companies Act...,contains the basic framework for\ncompanies to...,"Section 77A of the Companies Act, 1956 contain...",1357124740967,"regulations, guidelines, authority",39,Raj,"{'answer_start': [39], 'text': ['contains the ..."
100,105,What does the Section 77A(4) of the Companies ...,every buy back shall\nbe completed within a pe...,"Section 77A(4) of the Companies Act, 1956 spec...",1357124740967,"regulations, guidelines, authority, act",57,Raj,"{'answer_start': [57], 'text': ['every buy bac..."
101,106,What does the Section 77A(2) of the Companies ...,prohibits only back to back\nbuy backs through...,"Section 77A(2) of the Companies Act,1956 prohi...",1357124740967,"regulations, guidelines, authority, act",41,Raj,"{'answer_start': [41], 'text': ['prohibits onl..."


In [None]:
# "Start index" is now stored inside the ``answers`` dicts, so the raw column
# is dropped. errors="ignore" keeps the cell re-runnable: without it a second
# execution raises KeyError because the column is already gone.
df=df.drop(columns=["Start index"],errors="ignore")
df

Unnamed: 0,id,question,Answer Span,context,Document,Keywords,Annotator,answers
0,0,Does an innocent recipient of UPSI have any de...,insider may prove his innocence by demonstrati...,The report (Para 55) suggests “where a person ...,1585217059979,"innocent recipient, UPSI, defence",,"{'answer_start': [844], 'text': ['insider may ..."
1,1,Will a promoter group entity require a pre-cle...,"only by ""Designated persons"" if the value of t...","With respect to the query at 5(i), attention m...",0c9f4131313e54e49d7b9915993ab00e,"pre-clearance, compliance officer, designated ...",,"{'answer_start': [741], 'text': ['only by ""Des..."
2,2,Is inter-se off-market transfer of shares betw...,promoters have the option to convert warrants ...,"In the instant case, the said promoters have t...",0e8e621eb1aee80e102b4186ac874ed5,"inter-se off market transfer, block deal, cont...",,"{'answer_start': [30], 'text': ['promoters hav..."
3,3,Can an AIF invest its unutilized funds in liqu...,may invest investment income or investment pro...,The provisions under Regulation 15(1)(f) is pr...,1e4e946eca55a80467aad2e1546dd639,"unutilized funds, Alternate Investment Funds, ...",,"{'answer_start': [278], 'text': ['may invest i..."
4,4,What are the penal consequences of not furnish...,attract the penalty prescribed under section 1...,The Honorable Securities Appellate Tribunal (h...,1289453383303,"Noticee, Summons, failure to Comply with summo...",,"{'answer_start': [248], 'text': ['attract the ..."
...,...,...,...,...,...,...,...,...
98,103,What is InvIT?,vehicles allowing for adding of projects in fu...,InvITs are proposed to be vehicles allowing fo...,1387543144855,"regulations, guidelines, authority, markets",Raj,"{'answer_start': [26], 'text': ['vehicles allo..."
99,104,What does the Section 77A of the Companies Act...,contains the basic framework for\ncompanies to...,"Section 77A of the Companies Act, 1956 contain...",1357124740967,"regulations, guidelines, authority",Raj,"{'answer_start': [39], 'text': ['contains the ..."
100,105,What does the Section 77A(4) of the Companies ...,every buy back shall\nbe completed within a pe...,"Section 77A(4) of the Companies Act, 1956 spec...",1357124740967,"regulations, guidelines, authority, act",Raj,"{'answer_start': [57], 'text': ['every buy bac..."
101,106,What does the Section 77A(2) of the Companies ...,prohibits only back to back\nbuy backs through...,"Section 77A(2) of the Companies Act,1956 prohi...",1357124740967,"regulations, guidelines, authority, act",Raj,"{'answer_start': [41], 'text': ['prohibits onl..."


In [None]:
# index=False: the frame already carries a stray "Unnamed: 0" column from an
# earlier round trip; writing the index again would add one more unnamed column.
df.to_csv("tempQnA.csv",index=False)

In [None]:
df

Unnamed: 0,id,question,Answer Span,context,Document,Keywords,Annotator,answers
0,0,Does an innocent recipient of UPSI have any de...,insider may prove his innocence by demonstrati...,The report (Para 55) suggests “where a person ...,1585217059979,"innocent recipient, UPSI, defence",,"{'answer_start': [844], 'text': ['insider may ..."
1,1,Will a promoter group entity require a pre-cle...,"only by ""Designated persons"" if the value of t...","With respect to the query at 5(i), attention m...",0c9f4131313e54e49d7b9915993ab00e,"pre-clearance, compliance officer, designated ...",,"{'answer_start': [741], 'text': ['only by ""Des..."
2,2,Is inter-se off-market transfer of shares betw...,promoters have the option to convert warrants ...,"In the instant case, the said promoters have t...",0e8e621eb1aee80e102b4186ac874ed5,"inter-se off market transfer, block deal, cont...",,"{'answer_start': [30], 'text': ['promoters hav..."
3,3,Can an AIF invest its unutilized funds in liqu...,may invest investment income or investment pro...,The provisions under Regulation 15(1)(f) is pr...,1e4e946eca55a80467aad2e1546dd639,"unutilized funds, Alternate Investment Funds, ...",,"{'answer_start': [278], 'text': ['may invest i..."
4,4,What are the penal consequences of not furnish...,attract the penalty prescribed under section 1...,The Honorable Securities Appellate Tribunal (h...,1289453383303,"Noticee, Summons, failure to Comply with summo...",,"{'answer_start': [248], 'text': ['attract the ..."
...,...,...,...,...,...,...,...,...
98,103,What is InvIT?,vehicles allowing for adding of projects in fu...,InvITs are proposed to be vehicles allowing fo...,1387543144855,"regulations, guidelines, authority, markets",Raj,"{'answer_start': [26], 'text': ['vehicles allo..."
99,104,What does the Section 77A of the Companies Act...,contains the basic framework for\ncompanies to...,"Section 77A of the Companies Act, 1956 contain...",1357124740967,"regulations, guidelines, authority",Raj,"{'answer_start': [39], 'text': ['contains the ..."
100,105,What does the Section 77A(4) of the Companies ...,every buy back shall\nbe completed within a pe...,"Section 77A(4) of the Companies Act, 1956 spec...",1357124740967,"regulations, guidelines, authority, act",Raj,"{'answer_start': [57], 'text': ['every buy bac..."
101,106,What does the Section 77A(2) of the Companies ...,prohibits only back to back\nbuy backs through...,"Section 77A(2) of the Companies Act,1956 prohi...",1357124740967,"regulations, guidelines, authority, act",Raj,"{'answer_start': [41], 'text': ['prohibits onl..."


In [None]:
# Convert the QnA frame into the nested SQuAD JSON layout:
#   data -> [{title, paragraphs -> [{context, qas -> [{id, ..., answers}]}]}]
# Every row becomes its own single-paragraph, single-question entry.
evalset={"data": list()}

for row_idx, row in df.iterrows():
  gold = row["answers"]
  # The same gold span is listed four times, each as a separate dict object.
  answer_entries = [
      {"answer_start": gold["answer_start"][0], "text": gold["text"][0]}
      for _copy in range(4)
  ]
  qa_entry = {
      "id": row["id"],
      "is_impossible": False,
      "question": row["question"],
      "answers": answer_entries,
  }
  paragraph = {"context": row["context"], "qas": [qa_entry]}
  evalset["data"].append({"title": row["Document"], "paragraphs": [paragraph]})

In [None]:
evalset["data"][0]["paragraphs"][0]["qas"]

[{'answers': [{'answer_start': 844,
    'text': 'insider may prove his innocence by demonstrating the inclusive list of circumstances provided in the regulations'}],
  'id': 0,
  'is_impossible': False,
  'question': 'Does an innocent recipient of UPSI have any defence under the PIT regulations?'}]

In [None]:
# Persist the SQuAD-style evaluation set as a pickle.
with open('evalsetnew.pkl',"wb") as pkl_file:
  pickle.dump(evalset,pkl_file)

In [None]:
import json

# Write the same evaluation set as JSON.
with open("evalset.json","w") as json_file:
  json.dump(evalset,json_file)

In [None]:
type(evalset)

dict

### Custom Predictions

In [None]:
from tqdm import tqdm

def fileType(name):
    """Classify a document identifier by the corpus that contains it.

    ``legalf`` and ``docs`` are notebook-level collections defined outside
    this cell. The legal-case check runs first, so it wins when a name
    matches both collections.

    Returns:
        "legal case" if ``name + ".pdf"`` is in ``legalf``, "regulations" if
        ``name`` is in ``docs``, otherwise "misc".
    """
    pdf_name = name+".pdf"
    if pdf_name in legalf:
        label = "legal case"
    elif name in docs:
        label = "regulations"
    else:
        label = "misc"
    return label

# Defaults forwarded to QnAmodel by the evaluation cells below.
# docinput: document-filter argument; 'All' presumably means "no restriction" — TODO confirm.
docinput = 'All'
# k: QnAmodel's third argument; presumably the number of passages retrieved — TODO confirm.
k=15

In [None]:
# Label every QnA row with the corpus of its source document, then show how
# many questions fall into each corpus (regulations, misc, legal case).
df["doc_type"]=df["Document"].map(fileType)
tuple(len(df[df["doc_type"]==label]) for label in ("regulations","misc","legal case"))

(18, 51, 34)

In [None]:
# model_name = "deepset/roberta-base-squad2"
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"


In [None]:
# Run QnAmodel on the questions whose gold document is a regulation and
# collect predictions/references in the layout the SQuAD metric expects.
tempdf= df[df["doc_type"]=="regulations"]
formatted_predictions=list()
answeregs=list()   # second value returned by QnAmodel for each question
references=list()
itr=0
# iterrows() is a generator, so tqdm needs an explicit total to show progress and ETA.
for i,r in tqdm(tempdf.iterrows(), total=len(tempdf)):
    docinput="All"
    category = r["doc_type"]
    queryinput=r["question"]
    answer, ansreg = QnAmodel(queryinput, docinput,k,category, model_name)
    answeregs.append(ansreg)
    references.append({"id": i, "answers": r["answers"]})
    formatted_predictions.append({"id": i, "prediction_text": answer})
   

0it [00:00, ?it/s]INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

  return array(a, dtype, copy=False, order=order)
18it [16:09, 53.86s/it]


In [None]:
# Questions and gold contexts of the current subset, in row order.
# Direct column extraction replaces a row-by-row iterrows() loop (and its
# progress bar) whose only job was to copy these two fields.
questions = tempdf["question"].tolist()
contexts = tempdf["context"].tolist()
itr=0

18it [00:00, 6031.11it/s]


In [None]:
!pip3 install datasets

Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 5.1 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 9.7 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 10.0 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 9.9 MB/s 
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.1-py3-none-any.whl (5.7 kB)
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [None]:
from datasets import load_metric

# Flag selecting which scoring script to load: "squad_v2" when True, "squad" otherwise.
squad_v2 = False

if squad_v2:
    metric = load_metric("squad_v2")
else:
    metric = load_metric("squad")

# Score the current predictions against their gold references.
metric.compute(predictions=formatted_predictions, references=references)

Downloading:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

{'exact_match': 0.0, 'f1': 2.8571776644798956}

In [None]:
# Preview the gold answer texts. They are built from `references` here because
# `gold_ans` is only assigned in the following cell, so displaying that name
# at this point fails on a fresh top-to-bottom run.
[ref["answers"]["text"][0] for ref in references]

['A foreign portfolio investor shall appoint a branch of a bank authorised by the Reserve Bank of India for opening a foreign currency denominated account and special non-resident rupee account before making any investments in India.',
 '(a) that the aid shall not exceed seventy five per cent. of the total expenditure on legal proceedings;(b) such aid shall not be considered for more than one legal proceeding in a particular matter;(c) if more than one investors’ association applies for seeking legal aid, the investors’ association whose application is received first, shall be considered for such aid',
 'by simple majority of the shareholders in General Meeting',
 'in accordance with the investment objective of the relevant mutual fund scheme',
 'offer discretionary or non-discretionary or advisory services or a combination thereof to eligible investment funds',
 'reasonable grounds to believe that such company has been conducting in violation of these regulations;(3) to require any in

In [None]:
# Flatten the metric inputs into plain lists: the first gold answer text of
# each reference, and the predicted answer string for each question.
gold_ans = [ref["answers"]["text"][0] for ref in references]
answers = [pred["prediction_text"] for pred in formatted_predictions]

In [None]:
# One table per question for manual error analysis: question, gold context,
# the passage QnAmodel returned, gold answer and predicted answer.
testingdf=pd.DataFrame({
    "questions":questions,
    "contexts":contexts,
    "pred_regs":answeregs,
    "gold_ans":gold_ans,
    "answers":answers,
})
testingdf

Unnamed: 0,questions,contexts,pred_regs,gold_ans,answers
0,What would be a designated bank under the fore...,A foreign portfolio investor shall appoint a b...,45. (1) The Securities and Exchange Board of I...,A foreign portfolio investor shall appoint a b...,Securities and Exchange Board of India
1,What are the conditions to providing aid to th...,Conditions for Aid.\nThe aid to investors’ ass...,“(6) The total expenses of the scheme excludin...,(a) that the aid shall not exceed seventy five...,overall ceilings
2,How are Sweat Equity Shares to be issued to Pr...,6. (1) In case of Issue of sweat equity shares...,96. (1) The provisions of this Chapter ...,by simple majority of the shareholders in Gene...,Indian Depository Receipts
3,How can money collected under a specific mutua...,43. The moneys collected under any scheme of a...,“(6) The total expenses of the scheme excludin...,in accordance with the investment objective of...,bonds
4,What are the obligations and responsibilities ...,19. An eligible fund manager shall be required...,8. For the purposes of determining whe...,offer discretionary or non-discretionary or ad...,Net worth Requirement
5,What are the situations wherein the Investigat...,6. Without prejudice to the powers conferred u...,44. In order to remove any difficulties in the...,reasonable grounds to believe that such compan...,"Repeal, rescission and saving"
6,What are the provisions under a code of conduc...,7. (1) Initial Disclosures. (a). Every promote...,91. The terms and conditions subject to ...,"protection against any discharge, termination,...",91
7,What are the situations under which the Board ...,5. (1) No application for settlement of any sp...,30. In order to remove any difficulty in the i...,if it is of the opinion that the alleged defau...,Irregularity in procedure
8,What is the primary function of Compensation C...,5. (1) A company shall constitute a compensati...,“(2) Every mutual fund shall along with the of...,formulate the detailed terms and conditions of...,pay filing fees
9,What are the requisites before a person can ma...,4. Irrespective of acquisition or holding of s...,34. The regulations specified in the Sch...,"no acquirer shall acquire, directly or indirec...",Repeal and Savings


In [None]:
testingdf.to_csv("fromNewest.csv",index=False)

In [None]:
# For every test question, rank the candidate regulations by cosine
# similarity between Universal Sentence Encoder embeddings of the
# preprocessed query and of each regulation, and keep the five best.
RANK_NAMES = ("first", "second", "third", "fourth", "fifth")
tempdata = {rank: list() for rank in RANK_NAMES}

# Load the encoder once: it does not depend on the question, and loading it
# inside the loop repeated the same expensive load on every iteration.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

def embed(input):
    """Embed a list of strings with the loaded encoder."""
    return model(input)

# testingdf was built from the regulation subset, so state the category
# explicitly instead of relying on whatever an earlier loop last assigned.
category = "regulations"

for i,r in tqdm(testingdf.iterrows(), total=len(testingdf)):
    queryinput = r["questions"]
    qvocab,query = queryvocab(queryinput)
    # Start every question from "All", as the regulations evaluation loop does,
    # and keep regextract's result in a per-question variable. Rebinding the
    # global `docinput` fed each question's result into the next call and
    # changed the value later evaluation cells pass to QnAmodel.
    docinput = "All"
    query_docinput = regextract(docinput, docs, docregs, mainvocab, vocabdef, queryinput, glossary)
    query, qvocab, importantwords, expansionwords = querypreprocess(query, qvocab, definitions, finaltopics)

    origreg,reverseMap = tfidfreg(docregs,query_docinput,category)

    query_vector = embed([query])
    query_np = query_vector.numpy()[0]
    # Regulations are embedded in three slices rather than in one call
    # (presumably to bound memory use — TODO confirm).
    seg = int(len(origreg)/3)
    reg_vectors = tf.concat(axis=0, values=[
        embed(origreg[:seg]),
        embed(origreg[seg:seg*2]),
        embed(origreg[seg*2:]),
    ])
    # A dedicated loop name keeps the outer row `r` from being overwritten.
    d_cosines = [cosine_sim(query_np, reg_vec) for reg_vec in reg_vectors.numpy()]

    # Regulation indices ordered from most to least similar.
    out = np.array(d_cosines).argsort()[::-1]

    for position, rank in enumerate(RANK_NAMES):
        tempdata[rank].append(origreg[out[position]])


18it [10:32, 35.11s/it]


In [None]:
findings = pd.DataFrame(tempdata)
findings["gold"] = testingdf["contexts"]
findings

Unnamed: 0,first,second,third,fourth,fifth,gold
0,(2) In addition to the obligation of custodian...,(2) A foreign portfolio investor shall ensure ...,(3) When the foreign portfolio investor ...,(2) In case of jointly held depository ...,20. (1) A foreign portfolio investor shall inv...,A foreign portfolio investor shall appoint a b...
1,6. Conditions for Aid. The aid to investors’ a...,(4) The following conditions shall apply to so...,(2) Notwithstanding the conditions as specifie...,21. (1) The Sponsor and Manager of the A...,(4) of or to a specified investor unless the i...,Conditions for Aid.\nThe aid to investors’ ass...
2,(2) The issue of sweat equity shares to promot...,15. (1) The company shall ensure that - (a) ...,53[(5) The SR equity shares of promoters...,11. The amount of Sweat Equity shares i...,102[(5) If an issuer has issued SR equity shar...,6. (1) In case of Issue of sweat equity shares...
3,107. (1) In addition to other requiremen...,43. The moneys collected under any scheme of a...,232. (1) In addition to other requiremen...,10. (1) In addition to other requireme...,63. (1) In addition to other requirements laid...,43. The moneys collected under any scheme of a...
4,50[(6) The Manager shall be responsible for in...,(2) The Sponsor or Manager of Alternative...,(2) No custodian in which the sponsor or its ...,“(4) No person who is appointed as a trustee o...,7. For the purpose of grant of a certificate o...,19. An eligible fund manager shall be required...
5,"38. The inspecting authority shall, as soon as...","(4) The inspecting authority shall, for ...",(2) The authorised officer who is not t...,"(4) The inspecting authority shall, for the p...","22. The inspecting authority shall, as soon as...",6. Without prejudice to the powers conferred u...
6,"(3) ""company with family connection"" means: a....",190. (1) The issuer may offer its IDRs at diff...,49. Without prejudice to the exercise of...,21A. (1) A merchant banker shall not lead mana...,"(2) No person resident in India, except a rec...",7. (1) Initial Disclosures. (a). Every promote...
7,(3) The provisions of this regulation shall no...,26. Notwithstanding anything contained in ...,17. Notwithstanding anything contained in thes...,24. (1) Save as otherwise provided in these re...,(3) against whom the Board may initiate or has...,5. (1) No application for settlement of any sp...
8,(2) The Nomination and Remuneration Commit...,(2) The Nomination and Remuneration Commi...,"(2) Functional committee, comprising of: (a) ...","(2) Functional committee, comprising of: (a) ...",5. (1) A company shall constitute a com...,5. (1) A company shall constitute a compensati...
9,4. Irrespective of acquisition or holding of s...,5. (1) For the purposes of regulation 3 and r...,3. (1) No acquirer shall acquire shares or vo...,(2) A Collective Investment Management Co...,(2) The asset management company may appoint a...,4. Irrespective of acquisition or holding of s...


In [None]:
findings.to_csv("FindingsForRetrieval.csv",index=False)

In [None]:
# Retrieval ablation: rank the regulations using the raw question text as the
# query (the queryvocab / querypreprocess steps are skipped) and keep only the
# top-ranked regulation for each question.
without_preprocess_top_result = list()

# Load the encoder once: it does not depend on the question, and loading it
# inside the loop repeated the same expensive load on every iteration.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

def embed(input):
    """Embed a list of strings with the loaded encoder."""
    return model(input)

# testingdf was built from the regulation subset, so state the category
# explicitly instead of relying on whatever an earlier loop last assigned.
category = "regulations"

for i,r in tqdm(testingdf.iterrows(), total=len(testingdf)):
    queryinput = r["questions"]
    query = queryinput
    # Start every question from "All", as the regulations evaluation loop does,
    # and keep regextract's result in a per-question variable. Rebinding the
    # global `docinput` fed each question's result into the next call and
    # changed the value later evaluation cells pass to QnAmodel.
    docinput = "All"
    query_docinput = regextract(docinput, docs, docregs, mainvocab, vocabdef, queryinput, glossary)

    origreg,reverseMap = tfidfreg(docregs,query_docinput,category)

    query_vector = embed([query])
    query_np = query_vector.numpy()[0]
    # Regulations are embedded in three slices rather than in one call
    # (presumably to bound memory use — TODO confirm).
    seg = int(len(origreg)/3)
    reg_vectors = tf.concat(axis=0, values=[
        embed(origreg[:seg]),
        embed(origreg[seg:seg*2]),
        embed(origreg[seg*2:]),
    ])
    # A dedicated loop name keeps the outer row `r` from being overwritten.
    d_cosines = [cosine_sim(query_np, reg_vec) for reg_vec in reg_vectors.numpy()]

    # Regulation indices ordered from most to least similar.
    out = np.array(d_cosines).argsort()[::-1]
    without_preprocess_top_result.append(origreg[out[0]])
      

10it [05:51, 35.15s/it]


In [None]:
# `with_preprocess_top_result` is not assigned by any visible cell, so a fresh
# run fails here. The top-ranked regulations of the preprocessed-query run are
# stored in tempdata["first"], so define the name from that.
with_preprocess_top_result = list(tempdata["first"])
findings = pd.DataFrame({"with_preprocessing":with_preprocess_top_result, "without_preprocessing":without_preprocess_top_result})

In [None]:
findings["gold"]=testingdf["contexts"]
findings

Unnamed: 0,with_preprocessing,without_preprocessing,gold
0,(2) In addition to the obligation of custodian...,25. (1) A foreign portfolio investor or a glob...,A foreign portfolio investor shall appoint a b...
1,6. Conditions for Aid. The aid to investors’ a...,6. Conditions for Aid. The aid to investors’ a...,Conditions for Aid.\nThe aid to investors’ ass...
2,(2) The issue of sweat equity shares to promot...,(2) The issue of sweat equity shares to promot...,6. (1) In case of Issue of sweat equity shares...
3,107. (1) In addition to other requiremen...,(3) Moneys collected under any money market sc...,43. The moneys collected under any scheme of a...
4,50[(6) The Manager shall be responsible for in...,19H. Units of angel funds shall not be listed ...,19. An eligible fund manager shall be required...
5,"38. The inspecting authority shall, as soon as...",(3) While undertaking an inspection under the...,6. Without prejudice to the powers conferred u...
6,"(3) ""company with family connection"" means: a....",7I. (1) Every person required to have a Code o...,7. (1) Initial Disclosures. (a). Every promote...
7,(3) The provisions of this regulation shall no...,(3) The provisions of this regulation shall no...,5. (1) No application for settlement of any sp...
8,(2) The Nomination and Remuneration Commi...,(2) The Nomination and Remuneration Commi...,5. (1) A company shall constitute a compensati...
9,4. Irrespective of acquisition or holding of s...,(9) The board of directors of the targe...,4. Irrespective of acquisition or holding of s...


In [None]:
findings.to_csv("findings.csv",index=False)

In [None]:
# Save the passages QnAmodel returned for the regulation questions. The
# context manager closes (and flushes) the file instead of leaving an
# unreferenced open handle behind.
with open("answereg_regulations.txt","w") as out_file:
    print(answeregs,file=out_file)


In [None]:
# Save the current predictions and references; each file is closed
# explicitly by its context manager.
with open("prediction_regulations.txt","w") as out_file:
    print(formatted_predictions,file=out_file)
with open("references.txt","w") as out_file:
    print(references,file=out_file)

In [None]:
# Run QnAmodel on the questions whose gold document falls in the "misc" corpus.
tempdf= df[df["doc_type"]=="misc"]
formatted_predictions=list()
references=list()
# iterrows() is a generator, so tqdm needs an explicit total to show progress and ETA.
for i,r in tqdm(tempdf.iterrows(), total=len(tempdf)):
    # Reset the document filter for every question, as the regulations run
    # does. Without this, QnAmodel receives whatever value the retrieval
    # analysis cells last left in the global `docinput`.
    docinput="All"
    category = r["doc_type"]
    queryinput=r["question"]
    answer, ansreg = QnAmodel(queryinput, docinput,k,category, model_name)
    references.append({"id": i, "answers": r["answers"]})
    formatted_predictions.append({"id": i, "prediction_text": answer})

  return array(a, dtype, copy=False, order=order)
51it [50:45, 59.71s/it]


In [None]:
metric.compute(predictions=formatted_predictions, references=references)

{'exact_match': 0.0, 'f1': 3.3010684379852737}

In [None]:
# Save the "misc" predictions; the context manager closes the file explicitly.
with open("prediction_misc.txt","w") as out_file:
    print(formatted_predictions,file=out_file)

In [None]:
# Run QnAmodel on the questions whose gold document is a legal case.
tempdf= df[df["doc_type"]=="legal case"]
formatted_predictions=list()
references=list()
# iterrows() is a generator, so tqdm needs an explicit total to show progress and ETA.
for i,r in tqdm(tempdf.iterrows(), total=len(tempdf)):
    # Reset the document filter for every question, as the regulations run
    # does. Without this, QnAmodel receives whatever value the retrieval
    # analysis cells last left in the global `docinput`.
    docinput="All"
    category = r["doc_type"]
    queryinput=r["question"]
    answer, ansreg = QnAmodel(queryinput, docinput,k,category, model_name)
    references.append({"id": i, "answers": r["answers"]})
    formatted_predictions.append({"id": i, "prediction_text": answer})

  return array(a, dtype, copy=False, order=order)
34it [43:08, 76.13s/it]


In [None]:
metric.compute(predictions=formatted_predictions, references=references)

{'exact_match': 0.0, 'f1': 2.2543544309596264}

In [None]:
# Save the legal-case predictions; the context manager closes the file explicitly.
with open("prediction_legal.txt","w") as out_file:
    print(formatted_predictions,file=out_file)


In [None]:
# NOTE(review): at this point formatted_predictions/references hold only the
# most recent run (the legal-case subset), and `model_name` in the visible
# configuration cell is the BERT-large checkpoint (the RoBERTa line is
# commented out). Confirm the "roberta" label before relying on this file.
os.makedirs("CustomPredictions", exist_ok=True)  # open() does not create missing directories
with open("CustomPredictions/prediction_roberta.txt","w") as out_file:
    print(formatted_predictions,file=out_file)
with open("CustomPredictions/references.txt","w") as out_file:
    print(references,file=out_file)

In [None]:
# Full-pipeline evaluation over every question with the Longformer reader.
model_name = "mrm8488/longformer-base-4096-finetuned-squadv2"

formatted_predictions=list()
references=list()
for i,r in tqdm(df.iterrows(), total=len(df)):
    # Per-question reset, as in the regulations run, so the call does not
    # depend on a value left in the global `docinput` by earlier cells.
    docinput="All"
    category = fileType(r["Document"])
    answer, ansreg = QnAmodel(r["question"], docinput,k,category, model_name)
    # The SQuAD metric needs the gold answer as
    # {"answer_start": [...], "text": [...]} (the "answers" column), not the
    # bare "Answer Span" string.
    references.append({"id": r["id"], "answers": r["answers"]})
    formatted_predictions.append({"id": r["id"], "prediction_text": answer})



In [None]:
metric.compute(predictions=formatted_predictions, references=references)

In [None]:
# Save the Longformer predictions. The directory is created first because
# open() does not create it; the context manager closes the file explicitly.
os.makedirs("CustomPredictions", exist_ok=True)
with open("CustomPredictions/prediction_longformer.txt","w") as out_file:
    print(formatted_predictions,file=out_file)

In [None]:
# Full-pipeline evaluation over every question with the BERT-large reader.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

formatted_predictions=list()
references=list()
for i,r in tqdm(df.iterrows(), total=len(df)):
    # Per-question reset, as in the regulations run, so the call does not
    # depend on a value left in the global `docinput` by earlier cells.
    docinput="All"
    category = fileType(r["Document"])
    answer, ansreg = QnAmodel(r["question"], docinput,k,category, model_name)
    # The SQuAD metric needs the gold answer as
    # {"answer_start": [...], "text": [...]} (the "answers" column), not the
    # bare "Answer Span" string.
    references.append({"id": r["id"], "answers": r["answers"]})
    formatted_predictions.append({"id": r["id"], "prediction_text": answer})

metric.compute(predictions=formatted_predictions, references=references)

In [None]:
metric.compute(predictions=formatted_predictions, references=references)

In [None]:
# Save the BERT-large predictions. The directory is created first because
# open() does not create it; the context manager closes the file explicitly.
os.makedirs("CustomPredictions", exist_ok=True)
with open("CustomPredictions/prediction_bert.txt","w") as out_file:
    print(formatted_predictions,file=out_file)

In [None]:
# Full-pipeline evaluation over every question with the SpanBERT reader.
model_name = "mrm8488/spanbert-finetuned-squadv1"

formatted_predictions=list()
references=list()
for i,r in tqdm(df.iterrows(), total=len(df)):
    # Per-question reset, as in the regulations run, so the call does not
    # depend on a value left in the global `docinput` by earlier cells.
    docinput="All"
    category = fileType(r["Document"])
    answer, ansreg = QnAmodel(r["question"], docinput,k,category, model_name)
    # The SQuAD metric needs the gold answer as
    # {"answer_start": [...], "text": [...]} (the "answers" column), not the
    # bare "Answer Span" string.
    references.append({"id": r["id"], "answers": r["answers"]})
    formatted_predictions.append({"id": r["id"], "prediction_text": answer})

metric.compute(predictions=formatted_predictions, references=references)

In [None]:
metric.compute(predictions=formatted_predictions, references=references)

In [None]:
# Save the SpanBERT predictions. The directory is created first because
# open() does not create it; the context manager closes the file explicitly.
os.makedirs("CustomPredictions", exist_ok=True)
with open("CustomPredictions/prediction_spanbert.txt","w") as out_file:
    print(formatted_predictions,file=out_file)

In [None]:
# Full-pipeline evaluation over every question with the deepset BERT-base reader.
model_name = "deepset/bert-base-cased-squad2"

formatted_predictions=list()
references=list()
for i,r in tqdm(df.iterrows(), total=len(df)):
    # Per-question reset, as in the regulations run, so the call does not
    # depend on a value left in the global `docinput` by earlier cells.
    docinput="All"
    category = fileType(r["Document"])
    answer, ansreg = QnAmodel(r["question"], docinput,k,category, model_name)
    # The SQuAD metric needs the gold answer as
    # {"answer_start": [...], "text": [...]} (the "answers" column), not the
    # bare "Answer Span" string.
    references.append({"id": r["id"], "answers": r["answers"]})
    formatted_predictions.append({"id": r["id"], "prediction_text": answer})

metric.compute(predictions=formatted_predictions, references=references)

In [None]:
metric.compute(predictions=formatted_predictions, references=references)

In [None]:
# Save the deepset BERT-base predictions. The directory is created first
# because open() does not create it; the context manager closes the file.
os.makedirs("CustomPredictions", exist_ok=True)
with open("CustomPredictions/prediction_deepsetBertbase.txt","w") as out_file:
    print(formatted_predictions,file=out_file)


## Normal Predictions

In [None]:
import ast
import csv

# Reload the QnA table from tempQnA.csv as a list of
# {"id": <row position>, "data": <row dict>} records.
data = []
csvFilePath = 'tempQnA.csv'

# newline='' is what the csv module requires for files it reads; without it,
# line endings embedded in quoted fields can be translated, which would alter
# context text that the stored answer offsets refer to.
with open(csvFilePath, encoding='utf-8', newline='') as csvf:
    csvReader = csv.DictReader(csvf)

    # `row_idx` rather than `id`: as a notebook-level name, `id` would shadow
    # the builtin for every later cell.
    for row_idx,rows in enumerate(csvReader):
        # The CSV holds the answers dict as its repr; parse it back without eval.
        rows['answers'] = ast.literal_eval(rows["answers"])
        data.append({"id": row_idx, "data": rows})

In [None]:
!pip3 install datasets transformers

Collecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[?25l[K     |█▏                              | 10 kB 20.7 MB/s eta 0:00:01[K     |██▎                             | 20 kB 23.2 MB/s eta 0:00:01[K     |███▍                            | 30 kB 24.9 MB/s eta 0:00:01[K     |████▌                           | 40 kB 27.2 MB/s eta 0:00:01[K     |█████▋                          | 51 kB 29.7 MB/s eta 0:00:01[K     |██████▊                         | 61 kB 32.0 MB/s eta 0:00:01[K     |████████                        | 71 kB 28.5 MB/s eta 0:00:01[K     |█████████                       | 81 kB 29.5 MB/s eta 0:00:01[K     |██████████▏                     | 92 kB 30.6 MB/s eta 0:00:01[K     |███████████▎                    | 102 kB 32.2 MB/s eta 0:00:01[K     |████████████▍                   | 112 kB 32.2 MB/s eta 0:00:01[K     |█████████████▌                  | 122 kB 32.2 MB/s eta 0:00:01[K     |██████████████▋                 | 133 kB 32.2 MB/s et

In [None]:
from datasets import load_metric

# Flag selecting which scoring script to load: "squad_v2" when True, "squad" otherwise.
squad_v2 = False

if squad_v2:
    metric = load_metric("squad_v2")
else:
    metric = load_metric("squad")

Downloading:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

# Question-answering pipeline; model weights and tokenizer come from the same checkpoint.
model_name = "deepset/roberta-base-squad2"
model = pipeline("question-answering", model=model_name, tokenizer=model_name)


In [None]:
# Run the current QA pipeline (`model`) over every example and collect its answer
# alongside the gold answers, both keyed by the example's "id" field.
references = []
formatted_predictions = []
for example in (entry["data"] for entry in data):
    answer_text = model(question=example["question"], context=example["context"])["answer"]
    references.append({"id": example["id"], "answers": example["answers"]})
    formatted_predictions.append({"id": example["id"], "prediction_text": answer_text})



  return array(a, dtype, copy=False, order=order)


In [None]:
# Score the predictions against the gold references (reports exact_match and f1).
metric.compute(references=references, predictions=formatted_predictions)

{'exact_match': 14.563106796116505, 'f1': 50.58989896858574}

In [None]:
# Save the printed form of the predictions and of the gold references.
# The output folder is created on demand so the cell also works on a fresh runtime,
# and `with` blocks make sure both files are flushed and closed.
os.makedirs("Predictions", exist_ok=True)
with open("Predictions/prediction_roberta.txt", "w") as predictions_file:
    print(formatted_predictions, file=predictions_file)
with open("Predictions/references.txt", "w") as references_file:
    print(references, file=references_file)

In [None]:
# Replace `model` with a question-answering pipeline built from the Longformer checkpoint.
model_name = "mrm8488/longformer-base-4096-finetuned-squadv2"
model = pipeline("question-answering", model=model_name, tokenizer=model_name)


Downloading:   0%|          | 0.00/757 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/567M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# Run the current QA pipeline (`model`) over every example and collect its answer
# alongside the gold answers, both keyed by the example's "id" field.
references = []
formatted_predictions = []
for example in (entry["data"] for entry in data):
    answer_text = model(question=example["question"], context=example["context"])["answer"]
    references.append({"id": example["id"], "answers": example["answers"]})
    formatted_predictions.append({"id": example["id"], "prediction_text": answer_text})



  return array(a, dtype, copy=False, order=order)


In [None]:
# Score the predictions against the gold references (reports exact_match and f1).
metric.compute(references=references, predictions=formatted_predictions)

{'exact_match': 13.592233009708737, 'f1': 45.494179337665344}

In [None]:
# Save the printed form of the prediction list; the `with` block ensures the
# file is flushed and closed instead of relying on garbage collection.
with open("Predictions/prediction_longformer.txt", "w") as predictions_file:
    print(formatted_predictions, file=predictions_file)

In [None]:
# Replace `model` with a question-answering pipeline built from the BERT-large checkpoint.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = pipeline("question-answering", model=model_name, tokenizer=model_name)


Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# Run the current QA pipeline (`model`) over every example and collect its answer
# alongside the gold answers, both keyed by the example's "id" field.
references = []
formatted_predictions = []
for example in (entry["data"] for entry in data):
    answer_text = model(question=example["question"], context=example["context"])["answer"]
    references.append({"id": example["id"], "answers": example["answers"]})
    formatted_predictions.append({"id": example["id"], "prediction_text": answer_text})



  return array(a, dtype, copy=False, order=order)


In [None]:
# Score the predictions against the gold references (reports exact_match and f1).
metric.compute(references=references, predictions=formatted_predictions)

{'exact_match': 15.533980582524272, 'f1': 53.892569072829474}

In [None]:
# Save the printed form of the prediction list; the `with` block ensures the
# file is flushed and closed instead of relying on garbage collection.
with open("Predictions/prediction_bert.txt", "w") as predictions_file:
    print(formatted_predictions, file=predictions_file)

In [None]:
# Replace `model` with a question-answering pipeline built from the SpanBERT checkpoint.
model_name = "mrm8488/spanbert-finetuned-squadv1"
model = pipeline("question-answering", model=model_name, tokenizer=model_name)

Downloading:   0%|          | 0.00/493 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Run the current QA pipeline (`model`) over every example and collect its answer
# alongside the gold answers, both keyed by the example's "id" field.
references = []
formatted_predictions = []
for example in (entry["data"] for entry in data):
    answer_text = model(question=example["question"], context=example["context"])["answer"]
    references.append({"id": example["id"], "answers": example["answers"]})
    formatted_predictions.append({"id": example["id"], "prediction_text": answer_text})



  return array(a, dtype, copy=False, order=order)


In [None]:
# Score the predictions against the gold references (reports exact_match and f1).
metric.compute(references=references, predictions=formatted_predictions)

{'exact_match': 11.650485436893204, 'f1': 46.83910790924299}

In [None]:
# Save the printed form of the prediction list; the `with` block ensures the
# file is flushed and closed instead of relying on garbage collection.
with open("Predictions/prediction_spanbert.txt", "w") as predictions_file:
    print(formatted_predictions, file=predictions_file)

In [None]:
# Replace `model` with a question-answering pipeline built from the deepset BERT-base checkpoint.
model_name = "deepset/bert-base-cased-squad2"
model = pipeline("question-answering", model=model_name, tokenizer=model_name)


Downloading:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/152 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Run the current QA pipeline (`model`) over every example and collect its answer
# alongside the gold answers, both keyed by the example's "id" field.
references = []
formatted_predictions = []
for example in (entry["data"] for entry in data):
    answer_text = model(question=example["question"], context=example["context"])["answer"]
    references.append({"id": example["id"], "answers": example["answers"]})
    formatted_predictions.append({"id": example["id"], "prediction_text": answer_text})


  return array(a, dtype, copy=False, order=order)


In [None]:
# Score the predictions against the gold references (reports exact_match and f1).
metric.compute(references=references, predictions=formatted_predictions)

{'exact_match': 10.679611650485437, 'f1': 41.617463559093416}

In [None]:
# Save the printed form of the prediction list; the `with` block ensures the
# file is flushed and closed instead of relying on garbage collection.
with open("Predictions/prediction_deepsetBertbase.txt", "w") as predictions_file:
    print(formatted_predictions, file=predictions_file)

## Output collation

In [None]:
import ast
import re

# Gather every model's saved answers, plus the gold references, into one dict of
# lists keyed by a short column name ("bert", "longformer", ..., "references").
# NOTE(review): this rebinds `data`, which the prediction cells above iterate over
# as the list of examples; re-run the CSV loading cell before re-running them.
data = dict()
for fn in ["prediction_bert.txt","prediction_longformer.txt","prediction_spanbert.txt","prediction_deepsetBertbase.txt","prediction_roberta.txt","references.txt"]:
    with open("Predictions/" + fn) as f:
        # Each file holds the printed repr of a list of dicts, so parse it as a
        # Python literal rather than slicing it apart with regexes. This keeps
        # answers containing quotes or brackets intact and decodes escape
        # sequences such as \n.
        records = ast.literal_eval(f.read().strip())

    if fn.startswith("prediction_"):
        results = [record["prediction_text"] for record in records]
    else:
        # Assumes SQuAD-style references, i.e. {"answers": {"text": [...]}} - TODO confirm.
        # The first gold answer is kept; an empty answer list becomes "".
        results = []
        for record in records:
            texts = record["answers"]["text"]
            if isinstance(texts, str):
                texts = [texts]
            results.append(texts[0] if texts else "")

    # "prediction_bert.txt" -> "bert", "references.txt" -> "references"
    name = re.sub(r"^prediction_|\.txt$", "", fn)
    data[name] = results

In [None]:
# One column per collated output list, one row per example.
mo = pd.DataFrame.from_dict(data)

In [None]:
# Attach the question text (taken from `df`), then put the question and the gold
# answer first, followed by the model columns in their existing order.
mo["question"] = df["question"]
leading_columns = ["question", "references"]
model_columns = [column for column in mo.columns.tolist() if column not in leading_columns]
mo = mo[leading_columns + model_columns]
mo

Unnamed: 0,question,references,bert,longformer,spanbert,deepsetBertbase,roberta
0,Does an innocent recipient of UPSI have any de...,insider may prove his innocence by demonstrati...,It was proposed not to bring such a defense in...,An insider may prove his innocence,An insider may prove his innocence,An insider may prove his innocence,An insider may prove his innocence
1,Will a promoter group entity require a pre-cle...,"only by ""Designated persons"" if the value of t...",will be required to obtain pre-clearance for t...,"if designated as a ""designated person"" by the ...",a promoter,will be required to obtain pre-clearance for t...,pre-clearance is required to be obtained only ...
2,Is inter-se off-market transfer of shares betw...,promoters have the option to convert warrants ...,will also attract the contra-trade restriction...,may attract the contra-trade restrictions,if the promoters or members of the promoter group,will also attract the contra-trade restriction...,contra-trade restrictions
3,Can an AIF invest its unutilized funds in liqu...,may invest investment income or investment pro...,may invest investment income or investment pro...,in liquid mutual funds,liquid mutual funds,liquid,may invest investment income
4,What are the penal consequences of not furnish...,attract the penalty prescribed under section 1...,penalty prescribed under section 15A of the SE...,the Investigating Authority of SEBI,the penalty,the penalty,penalty prescribed under section 15A of the SE...
...,...,...,...,...,...,...,...
98,What is the Section 21 of Securities Contract ...,vehicles allowing for adding of projects in fu...,vehicles allowing for adding of projects in fu...,vehicles allowing for adding of projects in fu...,vehicles,vehicles allowing for adding of projects in fu...,vehicles allowing for adding of projects in fu...
99,What does the section 11A(2) of the SEBI Act do?,contains the basic framework for\ncompanies to...,the basic framework for\ncompanies to buy back...,the basic framework for\ncompanies to buy back...,the basic framework for\ncompanies to buy back...,the basic framework for\ncompanies to buy back...,the basic framework for\ncompanies to buy back...
100,What is the Clause-41 of Equity Listing Agreem...,every buy back shall\nbe completed within a pe...,every buy back shall\nbe completed within a pe...,every buy back shall\nbe completed within a pe...,every buy back shall\nbe completed within a pe...,every buy back shall\nbe completed within a pe...,every buy back shall\nbe completed within a pe...
101,What is the o SEBI Committee on Disclosures an...,prohibits only back to back\nbuy backs through...,prohibits only back to back\nbuy backs through...,prohibits only back to back\nbuy backs through...,prohibits only back to back\nbuy backs through...,prohibits only back to back\nbuy backs through...,prohibits only back to back\nbuy backs through...


In [None]:
# Write the side-by-side comparison table to disk, leaving out the row index.
mo.to_csv(path_or_buf="Predictions/ModelOutputs.csv", index=False)