#### E2E Demo: Augmented Query Generation and Semantic Search with Cross-Encoder Refinement

    0) Tokenization of long passages using window and stride
    1) Unsupervised Query Generation 
    2) Fine Tuning Bi-Encoder to use for semantic search 
    3) Creation of Faiss index using bi-encoder encoded passages
    4) Incorporation of Cross-Encoder on top of results returned from Bi-Encoder retrieval

In [1]:
import sys,os,logging, gc
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, T5Tokenizer,T5TokenizerFast, T5ForConditionalGeneration
from sentence_transformers import util , CrossEncoder
import torch 
# Logging: install the default root handler, then raise the root logger to
# INFO so messages from this notebook and from imported libraries are shown.
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)

# The current working directory is treated as the project root; the config
# file is looked up directly inside it.
root_ = os.path.abspath(os.curdir)
cfg_path = Path(root_, "config.yaml")

# Put the project root on the import path so the local `util` package
# imported right below can be resolved.
sys.path.append(root_)
from util.misc import LoadCFG, seed_all, create_output_dirs
from util.data import load_data
from util.embedding_ops import query_ops
from util.model_ops import build_model , load_model
from util.index_ops import faiss_index 

# Fix the seed once, up front, via the project helper `seed_all`.
# NOTE(review): presumably seeds python/numpy/torch RNGs — confirm in util.misc.
SEED = 42
seed_all(SEED)


  from .autonotebook import tqdm as notebook_tqdm
INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
INFO:faiss.loader:Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.


### LOAD CONFIG PARAMETERS

In [2]:

# Load the project configuration and expose the values used by the rest of
# the notebook as module-level constants.
cfg = LoadCFG(cfg_path, base_dir=root_).load()

# input / output locations
DATA_PATH = cfg.data.input.data_path
SAVE_DIR = Path(cfg.data.output.data_save_dir)
MODEL_SAVE_DIR = Path(cfg.model.model_save_dir)
INDEX_SAVE_DIR = Path(cfg.model.ir.faiss_index.out_dir)

# sampling / training setup
NSAMPS = cfg.model.n_samps
TOK_BATCH_SIZE = cfg.model.tokenizer.batch_size
BI_ENCODER_MODEL_NAME = cfg.model.ir.bi_encoder.model_name
EPOCHS = cfg.model.n_epochs
BI_ENCODER_BATCH_SIZE = cfg.model.ir.bi_encoder.batch_size
CROSS_ENCODER_MODEL_NAME = cfg.model.ir.cross_encoder.model_name

# tokenizer setup
RETURN_TENSORS = cfg.model.tokenizer.return_tensors
PADDING = cfg.model.tokenizer.padding
RETURN_OVERFLOW_TOKENS = cfg.model.tokenizer.return_overflow_tokens
MAX_SEQ_LEN = cfg.model.tokenizer.max_seq_len
TRUNCATION = cfg.model.tokenizer.truncation
STRIDE = cfg.model.tokenizer.stride

# query generator setup
GENQ_MODEL_NAME = cfg.model.query_gen.model_name
N_QUERIES_PER_PASSAGE = cfg.model.query_gen.n_queries_per_passage

# device: use the GPU when one is visible, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clean up memory. Python garbage is collected first so that tensors which
# are no longer referenced are released before the CUDA cache is emptied.
gc.collect()
if DEVICE.type == "cuda":
    torch.cuda.empty_cache()
    logger.info(torch.cuda.memory_summary(device=DEVICE, abbreviated=True))
else:
    # torch.cuda.memory_summary() only accepts CUDA devices; calling it with
    # the CPU fallback would abort the cell, so it is skipped here.
    logger.info("CUDA not available; running on %s", DEVICE)

# create output dirs
# NOTE(review): INDEX_SAVE_DIR is not passed here — confirm the index code
# creates its own output directory.
create_output_dirs(SAVE_DIR, MODEL_SAVE_DIR)

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| GPU reserved memory   |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Non-releasable memory |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

File DOES exist:
	 c:\Users\zjc10\Desktop\Projects\code\MyModules\semantic_search\genq_pinecone\config.yaml


### Load models used in demo

In [3]:
# Query generator (T5) and its fast tokenizer, both resolved from the same
# checkpoint name.
qgen_model = T5ForConditionalGeneration.from_pretrained(GENQ_MODEL_NAME)
tokenizer = T5TokenizerFast.from_pretrained(GENQ_MODEL_NAME, do_lower_case=False)

# Bi-encoder read from the fine-tuned save location.
# NOTE(review): this is the same path the training cell further down writes
# to, so presumably a previous run must already have populated it — confirm
# how load_model behaves when the directory is missing.
ir_model = load_model(str(MODEL_SAVE_DIR.joinpath('fine_tuned_biencoder')), DEVICE)

# Stand-alone cross-encoder instance (not referenced again in the cells shown).
_ce = CrossEncoder(CROSS_ENCODER_MODEL_NAME)

# Switch the generator to inference mode (eval() returns the module itself),
# then move it to the selected device.
logger.info('forcing model into eval mode')
model = qgen_model.eval().to(DEVICE)
print(DEVICE)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: C:\Users\zjc10\Desktop\Projects\data\semantic_search\models\fine_tuned_biencoder
INFO:sentence_transformers.cross_encoder.CrossEncoder:Use pytorch device: cuda
INFO:__main__:forcing model into eval mode


cuda


#### LOAD DATA FOR QUERY GENERATION AND FINE TUNING 

In [4]:
# Load the 'squad' train split through the project loader (not from a local
# directory).
logger.info('loading data from huggyface')
df = load_data(
    load_from_directory=False,
    hf_dataset_name='squad',
    split='train',
)

# We want to emulate the scenario in which we do not have queries, so only
# the 'context' text is kept (all that is passed into the framework is a list
# of text).
logger.info('extracting text passages to generate queries for')

# De-duplicate while preserving first-seen order. A plain set() would order
# the strings by their per-process randomized hash, so the slice below would
# pick different passages on every interpreter run regardless of the seed.
unique_contexts = list(dict.fromkeys(df['context']))

# Take NSAMPS passages starting at a fixed offset and pair each one with a
# running document index: (doc_idx, text).
PASSAGE_OFFSET = 2300
passages = list(enumerate(unique_contexts[PASSAGE_OFFSET:PASSAGE_OFFSET + NSAMPS]))
print(len(passages))

INFO:root:loading data from huggyface
INFO:root:extracting text passages to generate queries for


20


### Initialize Query Generation Instance
0) tokenize text into overlapping spanning windows
1) create mapping of original input doc to associated tokenized chunks
2) for each chunk, generate 3 queries using t5
3) return output data with 3 queries per input chunk

In [7]:
# Build the helper that turns passages into synthetic queries. It receives
# the T5 tokenizer and generator, the output directory, and the
# tokenization / batching settings read from the config.
logger.info('initalize embedding querier')
queryer = query_ops(
    tokenizer,
    qgen_model,
    SAVE_DIR,
    n_queries_per_passage=N_QUERIES_PER_PASSAGE,
    save_batch_size=1000,
    train_batch_size=TOK_BATCH_SIZE,
    return_tensors=RETURN_TENSORS,
    padding=PADDING,
    return_overflowing_tokens=RETURN_OVERFLOW_TOKENS,
    max_seq_len=MAX_SEQ_LEN,
    truncation=TRUNCATION,
    stride=STRIDE,
)

# Generate (query, passage) pairs; the call hands back the output paths that
# are consumed by the next step.
logger.info('generating query, passage key value pairs')
query_passage_outpaths = queryer.gen_queries_from_passages(passages)

# Turn the generated output into a training frame (carrying doc / chunk
# indices) plus the pairs later passed to the bi-encoder training step.
# NOTE(review): presumably `pairs` holds sentence_transformers InputExample
# objects — confirm in util.embedding_ops.
logger.info('creating training data for bi-encoder fine tuning')
train_df, pairs = queryer.create_training_data(query_passage_outpaths)

# NOTE(review): this message is kept as-is, but no loader object is created
# in this cell — confirm where batching actually happens.
logger.info('creating loader to handle loading batches of data for model training')

# Show how the chunks of the first document (doc == 0) map back to it,
# without truncating the long text columns.
pd.set_option('display.max_colwidth', None)
train_df[train_df['doc'] == 0]

Unnamed: 0,text,doc,chunk,ec_query_txt,_index,passage
0,"Jeffrey Long and Rick Kittles give a long critique of the application of FST to human populations in their 2003 paper ""Human Genetic Diversity and the Nonexistence of Biological Races"". They find that the figure of 85% is misleading because it implies that all human populations contain on average 85% of all genetic diversity. They claim that this does not correctly reflect human population history, because it treats all human groups as independent. A more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other</s>",0,0,what percent of the human population contains genetic diversity,0_0,"0_0:Jeffrey Long and Rick Kittles give a long critique of the application of FST to human populations in their 2003 paper ""Human Genetic Diversity and the Nonexistence of Biological Races"". They find that the figure of 85% is misleading because it implies that all human populations contain on average 85% of all genetic diversity. They claim that this does not correctly reflect human population history, because it treats all human groups as independent. A more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other</s>"
1,"Jeffrey Long and Rick Kittles give a long critique of the application of FST to human populations in their 2003 paper ""Human Genetic Diversity and the Nonexistence of Biological Races"". They find that the figure of 85% is misleading because it implies that all human populations contain on average 85% of all genetic diversity. They claim that this does not correctly reflect human population history, because it treats all human groups as independent. A more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other</s>",0,0,what percent of human population is genetically diverse,0_0,"0_0:Jeffrey Long and Rick Kittles give a long critique of the application of FST to human populations in their 2003 paper ""Human Genetic Diversity and the Nonexistence of Biological Races"". They find that the figure of 85% is misleading because it implies that all human populations contain on average 85% of all genetic diversity. They claim that this does not correctly reflect human population history, because it treats all human groups as independent. A more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other</s>"
2,"Jeffrey Long and Rick Kittles give a long critique of the application of FST to human populations in their 2003 paper ""Human Genetic Diversity and the Nonexistence of Biological Races"". They find that the figure of 85% is misleading because it implies that all human populations contain on average 85% of all genetic diversity. They claim that this does not correctly reflect human population history, because it treats all human groups as independent. A more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other</s>",0,0,how fst works for human diversity,0_0,"0_0:Jeffrey Long and Rick Kittles give a long critique of the application of FST to human populations in their 2003 paper ""Human Genetic Diversity and the Nonexistence of Biological Races"". They find that the figure of 85% is misleading because it implies that all human populations contain on average 85% of all genetic diversity. They claim that this does not correctly reflect human population history, because it treats all human groups as independent. A more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other</s>"
3,"more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other human groups because it represents the ancestral group from which all non-African populations derive, but more than that, non-African groups only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented</s>",0,1,are all non-african humans related to all african,0_1,"0_1:more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other human groups because it represents the ancestral group from which all non-African populations derive, but more than that, non-African groups only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented</s>"
4,"more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other human groups because it represents the ancestral group from which all non-African populations derive, but more than that, non-African groups only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented</s>",0,1,why is africa a paraphyletic group,0_1,"0_1:more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other human groups because it represents the ancestral group from which all non-African populations derive, but more than that, non-African groups only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented</s>"
5,"more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other human groups because it represents the ancestral group from which all non-African populations derive, but more than that, non-African groups only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented</s>",0,1,which human group is paraphyletic?,0_1,"0_1:more realistic portrayal of the way human groups are related is to understand that some human groups are parental to other groups and that these groups represent paraphyletic groups to their descent groups. For example, under the recent African origin theory the human population in Africa is paraphyletic to all other human groups because it represents the ancestral group from which all non-African populations derive, but more than that, non-African groups only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented</s>"
6,"only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented a genetic bottleneck, with much of the diversity that existed in Africa not being carried out of Africa by the emigrating groups. This view produces a version of human population movements that do not result in all human populations being independent; but rather, produces a series of dilutions of diversity the further from Africa any population lives, each founding event representing a genetic subset of its parental population. Long</s>",0,2,where was genetic diversity carried out in africa?,0_2,"0_2:only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented a genetic bottleneck, with much of the diversity that existed in Africa not being carried out of Africa by the emigrating groups. This view produces a version of human population movements that do not result in all human populations being independent; but rather, produces a series of dilutions of diversity the further from Africa any population lives, each founding event representing a genetic subset of its parental population. Long</s>"
7,"only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented a genetic bottleneck, with much of the diversity that existed in Africa not being carried out of Africa by the emigrating groups. This view produces a version of human population movements that do not result in all human populations being independent; but rather, produces a series of dilutions of diversity the further from Africa any population lives, each founding event representing a genetic subset of its parental population. Long</s>",0,2,why are groups of african ancestors related to other groups,0_2,"0_2:only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented a genetic bottleneck, with much of the diversity that existed in Africa not being carried out of Africa by the emigrating groups. This view produces a version of human population movements that do not result in all human populations being independent; but rather, produces a series of dilutions of diversity the further from Africa any population lives, each founding event representing a genetic subset of its parental population. Long</s>"
8,"only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented a genetic bottleneck, with much of the diversity that existed in Africa not being carried out of Africa by the emigrating groups. This view produces a version of human population movements that do not result in all human populations being independent; but rather, produces a series of dilutions of diversity the further from Africa any population lives, each founding event representing a genetic subset of its parental population. Long</s>",0,2,why was there a movement in the human population away from africa,0_2,"0_2:only derive from a small non-representative sample of this African population. This means that all non-African groups are more closely related to each other and to some African groups (probably east Africans) than they are to others, and further that the migration out of Africa represented a genetic bottleneck, with much of the diversity that existed in Africa not being carried out of Africa by the emigrating groups. This view produces a version of human population movements that do not result in all human populations being independent; but rather, produces a series of dilutions of diversity the further from Africa any population lives, each founding event representing a genetic subset of its parental population. Long</s>"
9,". This view produces a version of human population movements that do not result in all human populations being independent; but rather, produces a series of dilutions of diversity the further from Africa any population lives, each founding event representing a genetic subset of its parental population. Long and Kittles find that rather than 85% of human genetic diversity existing in all human populations, about 100% of human diversity exists in a single African population, whereas only about 70% of human genetic diversity exists in a population derived from New Guinea. Long and Kittles argued that this still produces a global human population that is genetically homogeneous compared to other mammalian populations.</s><pad><pad>",0,3,what percent of genetic variation exists in african humans,0_3,"0_3:. This view produces a version of human population movements that do not result in all human populations being independent; but rather, produces a series of dilutions of diversity the further from Africa any population lives, each founding event representing a genetic subset of its parental population. Long and Kittles find that rather than 85% of human genetic diversity existing in all human populations, about 100% of human diversity exists in a single African population, whereas only about 70% of human genetic diversity exists in a population derived from New Guinea. Long and Kittles argued that this still produces a global human population that is genetically homogeneous compared to other mammalian populations.</s><pad><pad>"


### Train Information Retrieval model using Augmented (Query, chunk) inputs

In [8]:
# Fine-tune the bi-encoder for asymmetric search (information retrieval):
# passages are embedded so that they can be matched by short questions,
# as opposed to blindly taking the cosine similarity between a short and a
# long sequence of text.
logger.info('building model')
_biencoder_dir = str(MODEL_SAVE_DIR / 'fine_tuned_biencoder')
ir_model = build_model(
    pairs,
    BI_ENCODER_MODEL_NAME,
    _biencoder_dir,
    epochs=EPOCHS,
    batch_size=BI_ENCODER_BATCH_SIZE,
)

# Drop the training-time instance and reload the saved model from disk onto
# the selected device; this reloaded model is the one used to build the index.
del ir_model
ir_model = load_model(_biencoder_dir, DEVICE)
ir_model.eval()

INFO:__main__:building model
Some weights of MPNetModel were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['mpnet.pooler.dense.weight', 'mpnet.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
Iteration: 100%|██████████| 19/19 [00:16<00:00,  1.14it/s]
Iteration: 100%|██████████| 19/19 [00:16<00:00,  1.15it/s]
Epoch: 100%|██████████| 2/2 [00:33<00:00, 16.62s/it]
INFO:sentence_transformers.SentenceTransformer:Save model to C:\Users\zjc10\Desktop\Projects\data\semantic_search\models\fine_tuned_biencoder
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: C:\Users\zjc10\Desktop\Projects\data\semantic_search\models\fine_tuned_biencoder


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

### Create FAISS INDEX WITH PRE-TRAINED BI-ENCODER and optional CROSS ENCODER FOR FINE TUNING RESULTS

In [9]:
# Create passage embeddings with the newly fine-tuned bi-encoder and wrap
# them in the project's FAISS index helper.
#
# The frame passed in holds one row per unique chunk (`_index`, `passage`);
# the training frame repeats each chunk once per generated query, hence the
# drop_duplicates().
#
# The embedding width is taken from the model's public
# get_sentence_embedding_dimension() instead of reaching into module [1]:
# that does not depend on where the pooling layer sits in the pipeline and
# reports the size of the final sentence vector, which is what the index
# stores.
f_idx = faiss_index(
    train_df[['_index', 'passage']].drop_duplicates().reset_index(drop=True),  # df
    ir_model,  # model
    ir_model.get_sentence_embedding_dimension(),
    text_col='passage',
    id_col='_index',
    index_outpath=INDEX_SAVE_DIR,
    cross_encoder_model_name=CROSS_ENCODER_MODEL_NAME,
)

# Create the index; the helper returns three output locations
# (index, data, ids).
index_outp, data_outp, id_outp = f_idx.create_index()

INFO:sentence_transformers.cross_encoder.CrossEncoder:Use pytorch device: cuda
Batches: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s]


In [21]:
# Search the index using the bi-encoder only: ask for 10 hits and leave the
# cross-encoder refinement step switched off.
query_ = 'who is zack?'
results = f_idx.search(query_,10, refine_with_crossencoder=False)


Batches: 100%|██████████| 1/1 [00:00<00:00, 39.17it/s]

>>>> Results in Total Time: 0.03685927391052246
(array([[4.886207 , 4.7444015, 4.593949 , 4.586941 , 4.3751616, 4.353961 ,
        4.3052807, 4.2540026, 4.154897 , 4.075979 ]], dtype=float32), array([[13, 12, 29, 18, 23, 31, 28, 20, 24,  8]], dtype=int64))
[8, 12, 13, 18, 20, 23, 24, 28, 29, 31]
[4.886207103729248, 4.744401454925537, 4.593948841094971, 4.586940765380859, 4.375161647796631, 4.353960990905762, 4.305280685424805, 4.254002571105957, 4.154897212982178, 4.075979232788086]
{'3_2': {'text': '3_2:at least in the sciences, to choose epistemological foundations and methods. For instance, Melanchthon and his disciples at University of Wittenberg were instrumental for integrating Copernican mathematical constructs into astronomical debate and instruction. Another example was the short-lived but fairly rapid adoption of Cartesian epistemology and methodology in European universities, and the debates surrounding that adoption, which led to more mechanistic approaches to scientific pr




In [20]:
# Search again for 10 hits, this time with cross-encoder refinement enabled.
# Note the query literal here has no trailing '?'.
results = f_idx.search('who is zack',10, refine_with_crossencoder=True)


Batches: 100%|██████████| 1/1 [00:00<00:00, 49.93it/s]


>>>> Results in Total Time: 0.03305244445800781
(array([[4.886207 , 4.7444015, 4.593949 , 4.586941 , 4.3751616, 4.353961 ,
        4.3052807, 4.2540026, 4.154897 , 4.075979 ]], dtype=float32), array([[13, 12, 29, 18, 23, 31, 28, 20, 24,  8]], dtype=int64))
[8, 12, 13, 18, 20, 23, 24, 28, 29, 31]
[4.886207103729248, 4.744401454925537, 4.593948841094971, 4.586940765380859, 4.375161647796631, 4.353960990905762, 4.305280685424805, 4.254002571105957, 4.154897212982178, 4.075979232788086]
{'3_2': {'text': '3_2:at least in the sciences, to choose epistemological foundations and methods. For instance, Melanchthon and his disciples at University of Wittenberg were instrumental for integrating Copernican mathematical constructs into astronomical debate and instruction. Another example was the short-lived but fairly rapid adoption of Cartesian epistemology and methodology in European universities, and the debates surrounding that adoption, which led to more mechanistic approaches to scientific pr

Batches: 100%|██████████| 1/1 [00:00<00:00, 73.90it/s]
