In [1]:
from fastapi import FastAPI
from ingestion import DocumentIngestion
from retriever_adaption import DomainAdaptionPipeline
import os 

In [2]:
# Read the Elasticsearch host from the environment.
# `os.environ.get` returns None for a missing variable instead of raising, so
# the absence must be checked explicitly; a try/except around it never fires.
host = os.environ.get('ELASTICSEARCH_HOST')
if not host:
    # Best-effort warning only: later cells receive `host` as-is (None or empty).
    print("ELASTICSEARCH_HOST host does not set as env parameter.")

In [3]:
# Display the host value read from ELASTICSEARCH_HOST.
# NOTE(review): the saved output of this cell exposes the server address;
# consider clearing it before sharing or committing the notebook.
host

'44.212.62.175'

### Test Ingestion API

In [5]:
# Create the ingestion helper from the project-local `ingestion` module.
ingest = DocumentIngestion()

In [7]:
# Bucket name and object key handed to `load_docs_s3`.
# NOTE(review): presumably this loads the JSON documents into `ingest` for the
# later `write_docs` call — confirm against the `ingestion` module.
bucket = 'domain-qa-system'
key = 'elasticsearch/ingestion/bioasq/bio_v1.json'
ingest.load_docs_s3(bucket, key)

In [9]:
# Write the loaded documents to the named index on `host`; the return value is
# kept so the next cell can display it.
index = 'bioasq-retriever-adaption'
ingested = ingest.write_docs(index, host=host)

  from .autonotebook import tqdm as notebook_tqdm
  if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


In [10]:
# Show what `write_docs` returned (the saved output is a count/_shards summary).
ingested

{'count': 1490,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}



### Test Adaption API

In [4]:
# Index used by the adaption pipeline below.
# NOTE(review): this rebinds `index` to a different name than the one written in
# the ingestion section ('bioasq-retriever-adaption'); confirm 'bioasq-1030'
# exists and is the intended index when running the notebook top to bottom.
index = 'bioasq-1030'

In [5]:
# Create the domain-adaption pipeline from the project-local `retriever_adaption` module.
adapt = DomainAdaptionPipeline()

In [6]:
# Set up the pipeline's document store and retriever for `index` on `host`.
# The saved output shows document embeddings being updated during this call.
adapt.init_docstore_retriever(index, host=host)

  from .autonotebook import tqdm as notebook_tqdm
  if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
Updating embeddings:   0%|                                                                                                                                                                      | 0/1490 [00:00<?, ? Docs/s]
Batches:   0%|                                                                                                                                                                                       | 0/47 [00:00<?, ?it/s][A
Batches:   2%|███▋                                                                                                                                                                           | 1/47 [00:00<00:13,  3.32it/s][A
Batches:   6%|███████████▏                                                                                                                                                                   | 3/47 [00:00<00:06,  7.00it/

In [7]:
# Inspect the retriever's attributes (model name, batch size, device, ...).
# NOTE(review): debugging dump of the full instance dict; consider showing only
# `_component_config['params']` in a shared notebook.
adapt.retriever.__dict__

{'_component_config': {'params': {'document_store': <haystack.document_stores.elasticsearch.ElasticsearchDocumentStore at 0x7f431764a640>,
   'embedding_model': 'sentence-transformers/msmarco-distilbert-base-tas-b',
   'model_format': 'sentence_transformers',
   'max_seq_len': 256,
   'progress_bar': True},
  'type': 'EmbeddingRetriever'},
 'outgoing_edges': 1,
 'devices': [device(type='cuda', index=0)],
 'document_store': <haystack.document_stores.elasticsearch.ElasticsearchDocumentStore at 0x7f431764a640>,
 'embedding_model': 'sentence-transformers/msmarco-distilbert-base-tas-b',
 'model_version': None,
 'use_gpu': True,
 'batch_size': 32,
 'max_seq_len': 256,
 'pooling_strategy': 'reduce_mean',
 'emb_extraction_layer': -1,
 'top_k': 10,
 'progress_bar': True,
 'use_auth_token': None,
 'scale_score': True,
 'api_key': None,
 'model_format': 'sentence_transformers',
 'embedding_encoder': <haystack.nodes.retriever._embedding_encoder._SentenceTransformersEmbeddingEncoder at 0x7f42afa20c

In [8]:
# Generate the training labels; the training cell later reads them from
# `adapt.gpl_labels`. The saved output shows question generation followed by
# querying passes.
adapt.generate_labels()

Using sep_token, but it is not set yet.
Generating questions: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3232/3232 [02:02<00:00, 26.39it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 120.28it/s]
Querying: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 92.87it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 111.78it/s]
Querying: 100%|█████████████████████████████████████████████████████████████

In [9]:
import mlflow

In [13]:
# Select the MLflow experiment used for the adaption runs, creating it on first
# use with artifacts stored under the S3 prefix below.
experiment_name = "domain-adaption"
s3_bucket = "s3://domain-qa-system/mlruns"
# `create_experiment` raises if the name already exists, which made this cell
# fail on every re-run; only create the experiment when it is missing.
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name, s3_bucket)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://domain-qa-system/mlruns', creation_time=1667176158911, experiment_id='1', last_update_time=1667176158911, lifecycle_stage='active', name='domain-adaption', tags={}>

In [21]:
# Show the artifact URI MLflow will use.
# NOTE(review): this is called outside a `with mlflow.start_run()` block; the
# fluent API appears to start a run implicitly in that case — confirm it does
# not leave a run active before the training cell calls `mlflow.start_run()`.
mlflow.get_artifact_uri()

's3://domain-qa-system/mlruns/988c7a38b52a4c029dceecc663489d73/artifacts'

In [14]:
# Train the retriever on the generated labels inside an MLflow run, then log the
# component parameters and the saved model directory to that run.
with mlflow.start_run() as run:
    adapt.retriever.train(adapt.gpl_labels, n_epochs=2, batch_size=32)
    adapt.retriever.save(f'saved_models/{index}')
    # Component parameters are read from `_component_config['params']` on each
    # object, falling back to {} when the attribute is absent.
    params = {
        # Key was misspelled 'document_score'; it holds the document store params.
        'document_store': adapt.document_store.__dict__.get('_component_config', {}).get('params', {}),
        'retriever': adapt.retriever.__dict__.get('_component_config', {}).get('params', {}),
        # The missing comma after this entry made the whole cell a SyntaxError.
        'question_generator': adapt.question_producer_params,
        'pseudo_label_generator': adapt.psg_params,
    }
    mlflow.log_params(params)
    # Uploads everything under saved_models/, not only saved_models/{index}.
    mlflow.log_artifacts('saved_models')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|                                                                                                                                                                                          | 0/2 [00:00<?, ?it/s]
Iteration:   0%|                                                                                                                                                                                    | 0/285 [00:00<?, ?it/s][A
Iteration:   0%|▌                                                                                                                                                                           | 1/285 [00:00<02:29,  1.90it/s][A
Iteration:   1%|█▏                                                                                                                                                                          | 2/285 [00:00<02:18,  2.04it/s][A
Iteration:   1%|█▊                                                                                         

### Evaluate Retriever performance

In [18]:
# Evaluate the trained retriever.
# NOTE(review): the saved output shows zero items iterated followed by
# ZeroDivisionError, so as written this cell fails. Presumably no evaluation
# labels/documents were found for the default arguments — confirm which label
# and document indices `eval()` reads and populate them before calling it.
adapt.retriever.eval()

0it [00:00, ?it/s]


ZeroDivisionError: division by zero