In [1]:
import sys,os,logging, gc
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, T5Tokenizer,T5TokenizerFast, T5ForConditionalGeneration
import torch 
#set up basic logging
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
logger =  logging.getLogger(__name__)

#config path 
root_ = os.path.abspath("")
cfg_path = Path(root_) / "config.yaml"

#custom imports
sys.path.append(root_)
from util.misc import LoadCFG, seed_all
from util.data import load_data
from util.embedding_ops import query_ops
from util.model_ops import build_model 
from util.index_ops import ScalableSemanticSearch

#set seed 
SEED = 42
seed_all(SEED)

#load cfg params
cfg = LoadCFG(cfg_path, base_dir = root_).load()
DATA_PATH = cfg.data.input.data_path
SAVE_DIR = Path(cfg.data.output.data_save_dir)
MODEL_SAVE_DIR = Path(cfg.model.model_save_dir)

NSAMPS = cfg.model.n_samps
TOK_BATCH_SIZE = cfg.model.tokenizer.batch_size
BI_ENCODER_MODEL_NAME = cfg.model.bi_encoder.model_name
EPOCHS = cfg.model.n_epochs
BI_ENCODER_BATCH_SIZE =  cfg.model.bi_encoder.batch_size


#device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#tokenizer setup
return_tensors = cfg.model.tokenizer.return_tensors
padding =  cfg.model.tokenizer.padding
return_overflow_tokens= cfg.model.tokenizer.return_overflow_tokens
max_seq_len = cfg.model.tokenizer.max_seq_len
truncation = cfg.model.tokenizer.truncation 
stride = cfg.model.tokenizer.stride 

#query generator setup
GENQ_MODEL_NAME = cfg.model.query_gen.model_name 
N_QUERIES_PER_PASSAGE =  cfg.model.query_gen.n_queries_per_passage 

#clean up gpu 
torch.cuda.empty_cache()
gc.collect()
logger.info(torch.cuda.memory_summary(device='cuda', abbreviated=True))

#create output folder if it doesnt exist
if not SAVE_DIR.is_dir():
    assert (not SAVE_DIR.is_file()), f'a directory to save outputs must be passed, you passed a full file path: {save_dir}'
    if not SAVE_DIR.parent.is_dir(): 
        os.mkdir(str(SAVE_DIR.parent))
        os.mkdir(str(SAVE_DIR))
    else:
        os.mkdir(str(SAVE_DIR))
    logger.info(f"new output directory created:{SAVE_DIR}")

if not MODEL_SAVE_DIR.is_dir():
    assert SAVE_DIR.parent.is_dir(), f'parent directory: {SAVE_DIR} does not exist'
    os.mkdir(str(MODEL_SAVE_DIR))

  from .autonotebook import tqdm as notebook_tqdm
INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
INFO:faiss.loader:Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|-------------------------------------------------------------------

File DOES exist:
	 c:\Users\zjc10\Desktop\Projects\code\MyModules\semantic_search\genq_pinecone\config.yaml


In [5]:
logging.info('loading data from huggyface')
df = load_data( load_from_directory=False 
               , hf_dataset_name = 'squad' 
               , split ='train') 

df[:1]

INFO:root:loading data from huggyface


{'id': ['5733be284776f41900661182'],
 'title': ['University_of_Notre_Dame'],
 'context': ['Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'],
 'question': ['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'],
 'answers': [{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}]}

In [8]:
logging.info('extracting text passages to generate queries for')
passages = list(set(df['context']))[:NSAMPS]
print(len(passages))
print(passages[0])

INFO:root:extracting text passages to generate queries for


20
The beginning of the Neolithic culture is considered to be in the Levant (Jericho, modern-day West Bank) about 10,200 â€“ 8,800 BC. It developed directly from the Epipaleolithic Natufian culture in the region, whose people pioneered the use of wild cereals, which then evolved into true farming. The Natufian period was between 12,000 and 10,200 BC, and the so-called "proto-Neolithic" is now included in the Pre-Pottery Neolithic (PPNA) between 10,200 and 8,800 BC. As the Natufians had become dependent on wild cereals in their diet, and a sedentary way of life had begun among them, the climatic changes associated with the Younger Dryas are thought to have forced people to develop farming.


In [9]:
logger.info('creating tokenizer and model to use in bi-encoder')
logger.info('creating model to use in bi-encoder')
#tokenizer  = T5Tokenizer.from_pretrained(GENQ_MODEL_NAME, legacy=False) 
qgen_model = T5ForConditionalGeneration.from_pretrained(GENQ_MODEL_NAME)
tokenizer = T5TokenizerFast.from_pretrained(GENQ_MODEL_NAME, do_lower_case=False)

INFO:__main__:creating tokenizer and model to use in bi-encoder
INFO:__main__:creating model to use in bi-encoder
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [10]:
#call eval() to force / ensure model is running in 'INFERENCE MODE' and not 'TRAINING' mode
logger.info('forcing model into eval mode')
qgen_model.eval()
model = qgen_model.to(device)
#tokenizer = tokenizer
print(device)

INFO:__main__:forcing model into eval mode


cuda


In [11]:
#initalize class to generate queries from passages
logger.info('initalize embedding querier')
queryer = query_ops(
     tokenizer
    , qgen_model 
    , SAVE_DIR
    , n_queries_per_passage = N_QUERIES_PER_PASSAGE
    , save_batch_size = 1000
    , train_batch_size = TOK_BATCH_SIZE    
    , return_tensors = return_tensors
    , padding =  padding
    , return_overflowing_tokens= return_overflow_tokens
    , max_seq_len = max_seq_len
    , truncation = truncation 
    , stride = stride 
    )

INFO:__main__:initalize embedding querier


In [12]:
#generate query,passage key value pairs , save to disk , return paths 
logger.info('generating query, passage key value pairs')
query_passage_outpaths = queryer.gen_queries_from_passages(passages)


INFO:__main__:generating query, passage key value pairs
2it [00:18,  9.40s/it]


In [19]:
queryer._passage2chunk_map[2:4]

[{'text': 'In 2012, resident foreigners made up 23.3% of the population. Most of these (64%) were from European Union or EFTA countries. Italians were the largest single group of foreigners with 15.6% of total foreign population. They were closely followed by Germans (15.2%), immigrants from Portugal (12.7%), France (5.6%), Serbia (5.3%), Turkey (3.8%), Spain (3.7%), and Austria (2%). Immigrants from Sri Lanka, most of them former Tamil refugees, were the largest group among people of Asian origin (6.3%). Additionally, the figures from 2012 show that 34.7% of the permanent resident population aged 15 or over in Switzerland, i.e. 2,335,000 persons, had an immigrant background. A third of this population (853,000) held Swiss citizenship. Four fifths of persons with an immigration background were themselves immigrants (first generation foreigners and native-born and naturalised Swiss citizens), whereas one fifth were born in Switzerland (second generation foreigners and native-born and na