### **01_index_dataset.ipynb**
### **Download BBC News dataset and index it in Elasticsearch**

* ##### 01 - Install packages
* ##### 02 - Import packages
* ##### 03 - Download Multilingual Universal Sentence Encoder model
* ##### 04 - Download BBC News dataset
* ##### 05 - Create Elasticsearch client
* ##### 06 - Create BBC News index in Elasticsearch
* ##### 07 - Index BBC News dataset in Elasticsearch

### 01 - Install packages

In [None]:
import sys

In [None]:
# Run pip through the interpreter that backs this kernel (sys.executable), so
# the packages are installed for the Python this notebook actually runs on.
# NOTE(review): no versions are pinned and --upgrade is passed, so the
# installed versions can differ from one run to the next.
!{ sys.executable } -m pip install --upgrade datasets elasticsearch ipython tensorflow tensorflow-hub tensorflow-text urllib3

### 02 - Import packages

In [None]:
import os

import tensorflow_text

from datasets       import load_dataset
from elasticsearch  import Elasticsearch
from IPython        import display
from tensorflow_hub import load

In [None]:
from urllib3 import disable_warnings
# Silence urllib3 warnings for the rest of the session.
# NOTE(review): presumably this targets the insecure-request warnings caused by
# verify_certs = False on the Elasticsearch client created later — confirm.
# Called without arguments it hides every other urllib3 warning as well.
disable_warnings()

### 03 - Download Multilingual Universal Sentence Encoder model

In [None]:
# Load the Multilingual Universal Sentence Encoder (multilingual-large,
# version 2) with tensorflow_hub.load; the returned object is called directly
# on text in the cells below.
# NOTE(review): the bare `import tensorflow_text` in the import cell is
# presumably needed only for its import side effects when loading this model —
# confirm before removing it.
model = load('https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/multilingual-large/versions/2')

In [None]:
# Smoke test: embed one sentence and display the first row of the result as a
# NumPy array.
# NOTE(review): the index mapping further down declares 512 dimensions —
# presumably the length of this vector; confirm against the output.
model('Hello World, ML Elasticsearch!')[0].numpy()

### 04 - Download BBC News dataset

In [None]:
# Load the 'SetFit/bbc-news' dataset. The returned object is iterated by key
# in the indexing cell, one key per dataset split.
bbc_news_dataset = load_dataset('SetFit/bbc-news')

In [None]:
# Show the loaded dataset object as the cell output.
bbc_news_dataset

### 05 - Create Elasticsearch client

In [None]:
# Connection settings for the Elasticsearch cluster.
# Each value is taken from an environment variable when that variable is set,
# so real credentials do not have to be typed into (and saved with) the
# notebook. When a variable is not set, the placeholder is used and must be
# replaced by hand before running the next cells.
es_host     = os.environ.get('ELASTICSEARCH_HOST',     '<elasticsearch_host>')
es_username = os.environ.get('ELASTICSEARCH_USERNAME', '<elasticsearch_username>')
es_password = os.environ.get('ELASTICSEARCH_PASSWORD', '<elasticsearch_password>')

In [None]:
# Collect the client options first, then build the client from them.
# NOTE(review): verify_certs = False turns off TLS certificate verification —
# confirm this is only used against a local or self-signed cluster.
es_client_options = {
    'hosts'        : es_host,
    'basic_auth'   : (es_username, es_password),
    'verify_certs' : False
}

es = Elasticsearch(**es_client_options)

In [None]:
# Request basic cluster information and display it; this is the first call
# that actually uses the host and credentials configured above.
es.info()

### 06 - Create BBC News index in Elasticsearch

In [None]:
# Name of the index that the following cells create, fill, refresh and count.
bbc_news_index = 'bbc_news'

In [None]:
# Create the index only when it does not exist yet, so that re-running the
# notebook does not stop here with an "index already exists" error.
# An existing index is left untouched: delete it first if its settings or
# mappings need to change.
if es.indices.exists(index = bbc_news_index):
    print(f'Index { bbc_news_index } already exists, skipping creation')
else:
    display.display(es.indices.create(
        index    = bbc_news_index,
        settings = {
            'number_of_shards'   : 2,
            'number_of_replicas' : 1
        },
        mappings = {
            'properties' : {
                # Raw article text, analysed for full-text search.
                'text'     : { 'type' : 'text' },
                # Sentence embedding of the article. 'dims' has to match the
                # length of the vectors stored by the indexing cell.
                # 'similarity' is spelled out because older 8.x releases
                # require it whenever 'index' is True; 'cosine' is the value
                # newer releases use by default.
                'vector'   : { 'type' : 'dense_vector', 'dims' : 512, 'index' : True, 'similarity' : 'cosine' },
                # Fields copied from the dataset plus the name of the split
                # the article came from.
                'metadata' : {
                    'properties' : {
                        'label'        : { 'type' : 'integer' },
                        'label_text'   : { 'type' : 'text' },
                        'dataset_type' : { 'type' : 'text' }
                    }
                }
            }
        }
    ))

### 07 - Index BBC News dataset in Elasticsearch

In [None]:
# Embed every article of every dataset split and store it as one document.
for dataset_type in bbc_news_dataset:

    dataset = bbc_news_dataset[dataset_type]
    size    = len(dataset)

    for index, item in enumerate(dataset, start = 1):

        # Replace the previous progress line instead of appending a new one.
        display.clear_output(wait = True)
        print(f'Indexing BBC News { dataset_type } dataset : { index } / { size }')

        document = {
            'text'     : item['text'],
            # First row of the model output for this single text.
            'vector'   : model(item['text'])[0].numpy(),
            'metadata' : {
                'label'        : item['label'],
                'label_text'   : item['label_text'],
                'dataset_type' : dataset_type
            }
        }

        # A deterministic id (split name + position in the split) makes this
        # cell safe to re-run: the same documents are overwritten instead of
        # being added again under new auto-generated ids.
        es.index(
            index    = bbc_news_index,
            id       = f'{ dataset_type }-{ index }',
            document = document
        )

In [None]:
# Refresh the index so that the documents written above are visible to the
# count in the next cell.
es.indices.refresh(index = bbc_news_index)

In [None]:
# Final check: number of documents in the index. After a single indexing run
# this is expected to equal the combined size of all dataset splits.
es.count(index = bbc_news_index)