# Pipeline for Question Answering -  ESG Assessment Projects BNP

>PARTIE CODE -------------------------------------------------------------------------------------------------

In [None]:
%reset

Table of content 

- [Code dependencies of the project](#Code-dependencies-of-the-project)
- [Args definition](#Arguments)
- [Question Answering Pipeline](#QA-Pipeline)
- [Main](#Main-program)

# Code dependencies of the project

In [1]:
## Pipeline for Question Answering on closed domain and non factoid questions - harshQA
## Developped by William Lambert  (Risk AIR Team , BNP Paribas)

import warnings
from utils.utils  import hide_warn
warnings.warn=hide_warn
import json
import os
import re
import sys
import uuid
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import tensorflow as tf 
from sentence_transformers import SentenceTransformer
from string import digits
from sklearn.base import BaseEstimator
from tabulate import tabulate

#Bert_dependencies
from harshQA_reranker.tokenization import* 
import harshQA_reranker.metrics as metrics
import harshQA_reranker.modeling as modeling
import harshQA_reranker.optimization as optimization 

#Import our pdf reader
from harshQA_pdf_reader.reader import pdfconverter

#Import bert finetuned pipeline
from harshQA_reranker.harshQA_tfrecord import *
from harshQA_reranker.harshQA_bert_builder import * 
from harshQA_reranker.harshQA_run_msmarco import run_msmarco

#Import all our models, wrap in a scikit learn estimator
from harshQA_retrievers.m1__Infersent import m1_Infersent
from harshQA_retrievers.m2__Bert import m2_Bert
from harshQA_retrievers.m3__Tfidf import m3_Tfidf
from harshQA_retrievers.m5__harshQA import m5_harshQA

#Utils
from utils.utils import remove_non_alpha
from utils.utils import generate_querries

## Arguments

In [2]:
arguments={}

In [13]:
arguments["model"]=5
arguments["demo"]=True
arguments["demo_query"]='Does the company support local agriculture'
arguments["demo_topics"]='local agriculture'
arguments["top_n"]=5
arguments["query_dir"]='./data/pdf_files/Tourism/Queries.txt'
arguments["size_cluster"]=80
arguments["domain"]="Tourism"
arguments["retrieved_company"]="Disney"
arguments["pdf_directory"]="./data/pdf_files/"
arguments["vocab_file"]="./data/bert/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt"
arguments["whole_corpus"]="./data/pdf_files/All"
arguments["vocab_builder"]="./data/corpusESG.json"
arguments["w2v_path"]="./data/fastText/crawl-300d-2M.vec"
arguments["model_path"]="./data/encoder/infersent2.pkl"
arguments["output_dir"]="./data/output"
arguments["bert_config_file"]="./data/bert_msmarco/bert_config.json"
arguments["init_checkpoint"]="./data/bert_msmarco/model.ckpt"
arguments["max_seq_length"]=512
arguments["max_query_length"]=128
arguments["msmarco_output"]=True
arguments["do_train"]=False
arguments["do_eval"]=True
arguments["train_batch_size"]=200
arguments["eval_batch_size"]=40
arguments["learning_rate"]=1e-6
arguments["num_train_steps"]=400000
arguments["max_eval_examples"]=None
arguments["num_warmup_steps"]=40000
arguments["save_checkpoints_steps"]=100
arguments["iterations_per_loop"]=10
arguments["min_gram"]=1
arguments["max_gram"]=1
arguments["lemmatize"]=False
arguments["transform_text"]=True
arguments["sentences_chunk"]=1
arguments['use_tpu']=False
arguments["tpu_name"]=None
arguments["tpu_zone"]=None
arguments["gcp_project"]=None
arguments["master"]=None
arguments["num_tpu_cores"]=8

In [14]:
def check_args():
    
    assert tf.__version__[0]=='1' , "This code has been implemented on tf 1.14, if you want to use you should modify the code"
    assert arguments['retrieved_company']!=None, "Select a Company"
    assert arguments['domain']!=None, "Select a Domain"
    assert arguments['model']!=None, "Select a Model"
    assert arguments['pdf_directory']!=None, "Select a $PATH which contains the folder Domain/Company/pdfs/ containing pdf files to query"
    assert arguments['whole_corpus']!=None, "Select a pdf directory path which contains all the pdf of the corpus to fit our model"
    assert arguments['vocab_builder']!=None, "Enter a .json path to save all our vocabulary while ingesting pdf files "
    assert arguments['lemmatize']==False or arguments['model']==3 ,"Lemmatize option is only available for Tf-Idf model (model n°3)"
    assert arguments['max_gram']==1 or arguments['model']==3 , "Multi gram option is only available for Tf-Idf model (model n°3)"
    assert arguments['transform_text']==True, "Your model will work better with a tokenizer that used (stemming/lemmatizing)"
    
    if arguments['model']==5:
        assert arguments['eval_batch_size']<=arguments['size_cluster'],"eval batch size should be less than the size of the cluster of preselected sentences"
        assert arguments['size_cluster']%arguments['eval_batch_size']==0,"eval batch size should be a multiple of the size of the cluster of preselected sentences"
        assert arguments['bert_config_file']!=None, "Enter a .json bert config file to specify the model architecture"
        assert arguments['vocab_file']!=None, "Enter the path of uncased_L-12_H-768_A-12/vocab.txt"
        assert "uncased_L-12_H-768_A-12" in arguments['vocab_file'], "You need to pass the vocab file of bert uncased L-12_H-768_A-12 "
        assert ".txt" in arguments['vocab_file'], "The bert vocab_file must be a .txt file"
        assert ".json" in arguments['bert_config_file'], "The bert_config_file must be a .json file"
        assert arguments['output_dir']!=None, "Enter the output directory where all the model bert,tfidf checkpoints will be written after train "
        "It will also store the raw tsv files and the tfrecords used to feed bert-reranker."
        assert arguments['init_checkpoint']!=None,"Enter a bert .ckpt init checkpoint"
        assert ".ckpt" in arguments['init_checkpoint'], "The init_checkpoint must be a .ckpt file"
        
    
        
    if arguments['model'] in [1,5]:
        assert arguments['w2v_path']!=None,"Specify the .vec file path of GloVe or fasText"
        assert ".vec" in arguments['w2v_path'], "The w2v path of GloVe or fasText muste be a .vec file"
        assert arguments['model_path']!=None,"Specify the .pkl file path of Infersent2"
        assert ".pkl" in arguments['model_path'], "The infersent model file muste be a .pkl file"
            
    
    if arguments['demo']:
        assert arguments['demo_query']!=None, "Specify a query for the demo"
        if arguments['model']==5:
            assert arguments['demo_topics']!=None, "Specify coma separated topics linked to your query for the demo"
    else:
        assert arguments['query_path']!=None , 'Specify a .txt file containing your queries line by line'
        
        
    

## QA Pipeline

In [15]:
class QApipeline():
    
    def __init__(self,**kwargs):


            #new kwargs: 'threshold' (float between 0.5 and 1.0)

        self.kwargs_converter = {key: value for key, value in kwargs.items()
                                if key in pdfconverter.__init__.__code__.co_varnames}

        self.kwargs_Tf_Idf = {key: value for key, value in kwargs.items()
                             if key in m3_Tfidf.__init__.__code__.co_varnames}

        self.kwargs_Infersent={key: value for key, value in kwargs.items()
                             if key in m1_Infersent.__init__.__code__.co_varnames}
        self.kwargs_Bert={key: value for key, value in kwargs.items()
                             if key in m2_Bert.__init__.__code__.co_varnames}


        self.MODELS=['INFERSENT - GLOVE','BERT PRETRAINED','TFIDF - LEMMATIZER & BIGRAM','BERT & TFIDF SHORT TEXT CLUSTERING','BERT FINETUNED & TFIDF SHORT TEXT CLUSTERING']
        self.usemodel=arguments['model']
        self.sentences_chunk=kwargs['sentences_chunk']
        return None


    def fit_reader(self,df=None):

        #CALL PDF READER 
        if df!=None:
            assert not False in [ col in df.columns.tolist() for col in ['directory_index','raw_paragraphs','paragraphs']], "The given dataframe is not of the proper format "
            self.df=df
        else:
            
            print('{}*********** READER *************'.format('\n'))
            print("Reading pdfs doc on location: {}".format(arguments['pdf_directory']+arguments['domain']+'/'+arguments['retrieved_company']+'/pdfs/'))
            self.df=pdfconverter(**self.kwargs_converter).transform()


            #print('You should give either your own harshQA dataframe to the fit_reader module or specify your pdf_directories, domain and retrieved_company FLAGS')



        #BUILD CONTENT AND DOCUMENT INDEX 
        self.content=[]
        self.content_raw=[]
        self.contents_doc=[]
        self.borders=[0]

        print('********* DOCUMENTS RETRIEVED **********')
        for j,repo in enumerate(sorted(list(set(self.df.directory_index)))):

            count_dic=[{},{}]
            remove_idx=[[],[]]
            content_doc=[]
            content_doc_raw=[]

            title=self.df[self.df.directory_index==repo].directory.tolist()[0]
            self.df[self.df.directory_index==repo]['raw_paragraphs'].apply(lambda sentences: self.update_count_dic(sentences,count_dic,0,remove_idx))
            self.df[self.df.directory_index==repo]['paragraphs'].apply(lambda sentences: self.update_count_dic(sentences,count_dic,1,remove_idx))
            self.df[self.df.directory_index==repo]['raw_paragraphs'].apply(lambda sentences: content_doc_raw.extend(sentences))
            self.df[self.df.directory_index==repo]['paragraphs'].apply(lambda sentences: content_doc.extend(sentences))

            #REMOVE TWIN SENTENCES AND REMOVE TOO SMALL SENTENCES
            remove_idx=list(set(remove_idx[0]+remove_idx[1]))
            content_doc=np.delete(np.array(content_doc),remove_idx)
            content_doc_raw=np.delete(np.array(content_doc_raw),remove_idx)

            content=[content_doc[i] for i in range(len(content_doc)) if (len(content_doc[i])>=50 )]
            content_raw=[content_doc_raw[i] for i in range(len(content_doc)) if (len(content_doc[i])>=50)]
            self.borders.append(len(content))

            print("FOLDER : {} , {} sentences \n \n".format(self.df.directory_index.unique()[j],len(content)))

            #ADD SENTENCES TO OUR FINAL OBJECTS
            self.content.extend(list(content))
            self.content_raw.extend(list(content_raw))
            self.contents_doc.append([content,content_raw])


        self.borders=list(np.cumsum(self.borders))

        #GROUP SENTENCES BY PAIR EVENTUALLY
        if self.sentences_chunk==2:

            self.content=[ ' '.join(x) for x in zip(self.content[0::2], self.content[1::2]) ]
            self.content_raw=[ ' '.join(x) for x in zip(self.content_raw[0::2], self.content_raw[1::2]) ]
            for i,(treated_sentences,raw_sentences) in enumerate(self.contents_doc):
                self.contents_doc[i][0]=[ ' '.join(x) for x in zip(treated_sentences[0::2], treated_sentences[1::2]) ]
                self.contents_doc[i][1]=[ ' '.join(x) for x in zip(raw_sentences[0::2], raw_sentences[1::2]) ]
            self.borders=[int(i/2) for i in self.borders]

        #REPLACE ALL DIGITS WITH SPECIAL TOKEN FOR OUR MODEL
        for i,c in enumerate(self.borders[:-1]):
            start_idx=self.borders[i]
            content=self.contents_doc[i][0]
            content_raw=self.contents_doc[i][1]

            #ADD TREATED TEXT TO CONTENTS_DOC
            for sentence_id,sentence in enumerate(content):

                words_list=sentence.split(" ")
                for word_id,w in enumerate(words_list):
                    try:
                        float(w)
                        words_list[word_id]="XXX"
                    except:
                        words_list[word_id]=w
                    self.content[start_idx+sentence_id]=" ".join(words_list)

                self.contents_doc[i][0][sentence_id]=" ".join(words_list)

        return self

    def fit(self):


        print('********* MODEL {} **********'.format(self.MODELS[self.usemodel-1]))

        if self.usemodel==1:
            #Fit Infersent
            self.model_retriever = m1_Infersent(**self.kwargs_Infersent)
            self.model_retriever.fit(self.content)
            self.model_retriever.transform(self.contents_doc[0][0])

        if self.usemodel==2:
            #Fit Bert pretrained

            self.model_retriever=m2_Bert(**self.kwargs_Bert)
            self.model_retriever.fit(self.content)#no finetuning for Bert
            self.model_retriever.transform(self.contents_doc[0][0])

        if self.usemodel==3:
            #Fit Tf-Idf model

            """
            This model is also called by HarshQA model (m5), so we need to turn off the saving output feature so that 
            it does not erase the harshQA output.
            """
            self.kwargs_Tf_Idf['save_idfs_path']=None
            self.kwargs_Tf_Idf['save_features_path']=None


            self.model_retriever = m3_Tfidf(**self.kwargs_Tf_Idf)
            self.model_retriever.fit(self.content)
            self.model_retriever.transform(self.contents_doc[0][0])



        if self.usemodel==5:
            #Fit harshQA model 

            output_TF=arguments['output_dir']+'/tf_idf_checkpoints/'

            args_harshQA={'save_kernel_path':output_TF+'kernel.npy',
                          'save_kernel_vocab_path':output_TF+'kernel_vocab.json',
                          'save_kernel_idx_path':output_TF+'kernel_vocab_idx.json',
                          'save_idfs_path':output_TF+'idfs.npy',
                          'save_features_path':output_TF+'vocab.json'}

            for key, value in arguments.items():
                if key in m5_harshQA.__init__.__code__.co_varnames:
                    args_harshQA[key]=value

            self.model_retriever=m5_harshQA(**args_harshQA)
            self.model_retriever.fit(self.content)
            self.model_retriever.transform(self.contents_doc[0])

        return self


        #Initialisation of Tf-Idf-Farahat

    def predict(self,Qst,Topics=None):
        """
        kwargs:
        ##VE_type: 'DP' for Detect Presence of 'VE' for Value extraction
        ##Qst: Querry
        ##VE_cdt : null
        """

        repo_to_query=0
        self.Qst_raw=Qst
        self.topics=Topics

        #Apply corpus transformations to querry before feeding it into our models
        newQst=[q.lower() for q in self.Qst_raw]
        newQst=[remove_non_alpha(q) for q in newQst]
        newQst=[q.replace('.','') for q in newQst]

        self.dataframe=[]        
        all_scores=[]
        all_models=[]
        all_querries=[]
        all_ranks=[]
        all_indices=[]
        all_answers=[]

        #Infersent retriever
        if self.usemodel !=5:
            for i,qu in enumerate(newQst):

                indices,scores=self.model_retriever.predict(qu)
                p=len(indices)
                all_scores.extend(scores.loc[indices].values[:,0])
                all_answers.extend([ self.contents_doc[repo_to_query][1][i] for i in indices])
                all_models.extend([self.MODELS[arguments['model']-1]]*p)
                all_ranks.extend(list(range(1,p+1)))
                all_querries.extend([self.Qst_raw[i]]*p)
                all_indices.extend(indices)


            self.dataframe=pd.DataFrame(np.c_[all_querries,all_models,all_ranks,all_indices,all_answers,all_scores],columns=['Question','Model','Rank','Doc_index','Answer','Score'])


        #harshQA retriever
        else:
            self.dataframe=self.model_retriever.predict(self.Qst_raw,self.topics)


        #FORMAT THE OUTPUT NICELY AND RETURN IT
        self.dataframe=self.dataframe.apply(self.add_ctxt,axis=1)
        self.dataframe['Rank']=self.dataframe['Rank'].map(lambda x: x[0])
        self.dataframe['Score']=self.dataframe['Score'].map(lambda x: np.round(float(x),4))
        self.dataframe['Company']=[arguments['retrieved_company']]*len(self.dataframe)
        self.dataframe=self.dataframe.sort_values(by=['Question','Company','Model','Rank']).reset_index(drop=True)[['Question','Company','Model','Answer','Rank','Score','Doc_index','Context_Answer']]
        return self.dataframe


    def string_retriever(self,sentence_list):
        return [w  for w in sentence_list if not w.isdigit()]

    def add_ctxt(self,row):
        try:
            row['Context_Answer']=' '.join([self.contents_doc[0][1][int(row.Doc_index)-1],row.Answer,self.contents_doc[0][1][int(row.Doc_index)+1]])
        except:
            print('No context for index:',row.Doc_index)
            row['Context Answer']= ' '
        return row

    def update_count_dic(self,sentences,counter,is_rawtext,remove_index):

        for i,c in enumerate(sentences):
            counter=counter[is_rawtext].copy()
            counter[c]=counter.get(c,0)+1
            counter[is_rawtext]=counter
            if counter[is_rawtext][c]>1:
                remove_index[is_rawtext].append(i)
        return None

## Choose your args

In [16]:
def collect_args():

    if not arguments['demo']:              
        path_q=arguments['query']      
        file= open(path_q,"r+")  
        text=file.read().replace("  ","")
        queries=text.split("\n")
        queries=[q.split("\t")[0] for q in queries if len(q)>1]
        topics=[q.split("\t")[1].split(",") for q in queries if len(q)>1]
        file.close()
        
    else:
        queries=[arguments['demo_query']]
        if arguments['model']==5:
            topics=arguments['demo_topics']
            topics=[topics.split(",")]
        
    pdf_dirs=[arguments['pdf_directory']+arguments['domain']+'/'+arguments['retrieved_company']]
    grams=(arguments['min_gram'],arguments['max_gram'])
    
    args_Infersent={'pdf_directories':pdf_dirs,'w2v_path': arguments['w2v_path'], 'model_path': arguments['model_path'] ,'top_n':arguments['top_n'],'ngram_range':grams,'lemmatize':arguments['lemmatize'],'transform_text':arguments['transform_text'],'l_questions':queries,'sentences_chunk':arguments['sentences_chunk'],'vocab_builder':arguments['vocab_builder']}
    args_Bert={'pdf_directories':pdf_dirs,'w2v_path': arguments['w2v_path'], 'model_path': arguments['model_path'] ,'top_n':arguments['top_n'],'ngram_range':grams,'lemmatize':arguments['lemmatize'],'transform_text':arguments['transform_text'],'l_questions':queries,'sentences_chunk':arguments['sentences_chunk'],'vocab_builder':arguments['vocab_builder']}
    args_Tf_Idf={'pdf_directories':pdf_dirs,'w2v_path': arguments['w2v_path'], 'model_path': arguments['model_path'] ,'top_n':arguments['top_n'],'ngram_range':grams,'lemmatize':arguments['lemmatize'],'transform_text':arguments['transform_text'],'l_questions':queries,'sentences_chunk':arguments['sentences_chunk'],'vocab_builder':arguments['vocab_builder']}
    args_TfBERT={'pdf_directories':pdf_dirs,'w2v_path': arguments['w2v_path'], 'model_path': arguments['model_path'] ,'top_n':arguments['top_n'],'ngram_range':grams,'lemmatize':arguments['lemmatize'],'transform_text':arguments['transform_text'],'l_questions':queries,'sentences_chunk':arguments['sentences_chunk'],'vocab_builder':arguments['vocab_builder']}
    args_TfBERT_enhanced={'pdf_directories':pdf_dirs,'w2v_path': arguments['w2v_path'], 'model_path': arguments['model_path'] ,'top_n':arguments['top_n'],'ngram_range':grams,'lemmatize':arguments['lemmatize'],'transform_text':arguments['transform_text'],'l_questions':queries,'sentences_chunk':arguments['sentences_chunk'],'vocab_builder':arguments['vocab_builder'],'topics':topics}
    args_All_transforms={'pdf_directories':pdf_dirs,'w2v_path': arguments['w2v_path'], 'model_path': arguments['model_path'] ,'top_n':arguments['top_n'],'ngram_range':grams,'lemmatize':arguments['lemmatize'],'transform_text':arguments['transform_text'],'l_questions':queries,'sentences_chunk':arguments['sentences_chunk'],'vocab_builder':arguments['vocab_builder'],'topics':topics}

    if arguments['model']==1:
        return args_Infersent
    elif arguments['model']==2:
        return args_Bert
    elif arguments['model']==3:
        return args_Tf_Idf
    elif arguments['model']==4:
        return args_TfBERT
    elif arguments['model']==5:
        return args_TfBERT_enhanced
    else:
        print('Select a correct model')


## Main program

In [17]:
window_size=80
dic_suffix={1:'FASTEXT',2:'BERT',3:'TFIDF',4:'BERTCLUST',5:'BERTCLUST_TUNED'}
args=collect_args()
check_args()
args_fit={key:value for key,value in args.items() if key not in ['l_questions','topics']}
QAmodel=QApipeline(**args) 
QAmodel.fit_reader()
QAmodel.fit()
results=QAmodel.predict(args['l_questions'],args.get('topics',[]))

if not arguments['demo']:
    dir=arguments['output_dir']+"/"+arguments['domain']+"/"+arguments['retrieved_company']
    if not os.path.exists(dir):
        os.makedirs(dir)
    results.to_csv(dir+'_result.csv')
else:
    print('*******************************  RESULTS of BERT  ***************************************')
    show=pd.DataFrame(results[['Score','Answer']].values,index=range(0,100*(arguments['top_n']),100),columns=['Score','Answer'])
    counter_newline={}
    for i,aw in enumerate(show.Answer.tolist()):
        dividend,quotient=len(aw)//window_size,len(aw)%window_size
        if quotient!=0:
            dividend+=1
        counter_newline[i]=dividend


        for newline in range(dividend):
            show.loc[(100*i)+(newline)]=[show.loc[100*i]['Score'],aw[window_size*newline:window_size*(newline+1)]]
    show=show.reset_index().sort_values(by='index')[['Score','Answer']].values
    indexs_tiled=np.concatenate([np.tile([i],counter_newline[i]) for i in range(arguments['top_n'])])
    print(tabulate(pd.DataFrame(show,index=['Answer °'+ str(i) for i in indexs_tiled]), headers='keys', tablefmt='psql'))




*********** READER *************
Reading pdfs doc on location: ./data/pdf_files/Tourism/Disney/pdfs/
files from ./data/pdf_files/Tourism/Disney/pdfs succesfully converted 


********* DOCUMENTS RETRIEVED **********
FOLDER : 1 , 1721 sentences 
 

********* MODEL BERT FINETUNED & TFIDF SHORT TEXT CLUSTERING **********
semantic kernel has been retrieved 



 Query : 	 Does the company support local agriculture 
 Topics : 	 local agriculture 
 Expansion : 	  | organ | product | biodivers 

********* CLUSTER OF 80  DOCS ********** =  
+----+---------------------------------------------------------------------------------------------------------+
|    | 0                                                                                                       |
|----+---------------------------------------------------------------------------------------------------------|
|  0 | It will also feature an array of exclusive original series and movies, along with titles/episodes fr... |
|  1 | Alw

Done!
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Using config: {'_model_dir': './data/output/bert_checkpoints', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <ten

AttributeError: module '__main__' has no attribute 'main'

In [19]:
QAmodel.borders

[0, 1721]