### Set up the Elasticsearch cluster

In [None]:
import os
import time
# Replace the shell hook IPython uses for `!command` lines with os.system.
get_ipython().system = os.system

In [None]:
# download Elasticsearch binaries into downloads folder
# <YOUR PASSWORD> is your sudo password
!mkdir ../downloads
!wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz -P ../downloads
!wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512 -P ../downloads
!tar -xzf ../downloads/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz -C ../downloads
!echo "<YOUR PASSWORD>" | sudo chown -R daemon:daemon ../downloads/elasticsearch-7.9.2/
!shasum -a 512 -c ../downloads/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512

In [None]:
# start the Elasticsearch server as a background process (trailing `&`), running as user `daemon`
# sudo flags: -S reads the password from stdin, -H sets HOME to the target user's home, -u picks the user
!echo "<YOUR PASSWORD>" | sudo -HSu daemon ../downloads/elasticsearch-7.9.2/bin/elasticsearch &

In [None]:
# check the daemon process status
# (the listing will usually also contain the `grep elasticsearch` command itself)
!!ps -ef | grep elasticsearch

In [None]:
# Address of the Elasticsearch node that the rest of the notebook talks to.
port = 9200
host = 'http://localhost:{}'.format(port)

In [None]:
# check the cluster started up correctly
# Fixed 30-second pause before probing the HTTP endpoint.
# NOTE(review): presumably this gives the server time to boot; a slow machine may need longer.
time.sleep(30)
!!curl -s {host}

## Build a search engine

In [None]:
import elasticsearch as es
import elasticsearch.helpers as helpers
from uuid import uuid4
from datetime import datetime as dt
import json

In [None]:
class SearchEngine:
    """Convenience wrapper around an Elasticsearch client.

    Groups the index-management, bulk-indexing and search calls used in this
    notebook behind a handful of methods. The underlying client is exposed as
    `self.client`.
    """

    # Python type name -> Elasticsearch field type, used by `extract_mappings`.
    # Looked up by type NAME so that `bool` is not mistaken for `int`.
    _FIELD_TYPES = {
        'int'      : 'integer',
        'float'    : 'float',
        'bool'     : 'boolean',
        'str'      : 'text',
        'datetime' : 'date'
    }

    def __init__(self, host):
        """Connect to the cluster at `host` and check that it answers a ping.

        Raises:
            Exception: if the ping is not answered.
        """
        self.client = es.Elasticsearch(host)
        if not self.client.ping():
            raise Exception('Error: could not connect to cluster')
        print('Ok: cluster is up')

    def cluster_info(self):
        """Print the client's `info()` response as indented JSON."""
        print(json.dumps(self.client.info(), indent = 2))

    def create_index(self, index_name, sim_module, mappings):
        """Create `index_name` with one shard, one replica and `sim_module`.

        Args:
            index_name: name of the index to create.
            sim_module: dict stored under the `similarity` index setting.
            mappings: dict merged into the top level of the request body
                (e.g. the value returned by `extract_mappings`).

        Raises:
            Exception: if an index called `index_name` already exists.
        """
        request_body = {
            'settings': {
                'number_of_shards': 1,
                'number_of_replicas': 1,
                'similarity' : sim_module
            }
        }
        # NOTE(review): the Elasticsearch create-index API documents the
        # top-level key as `mappings`, while `extract_mappings` returns
        # `mapping` -- confirm whether the cluster accepts this body. Renaming
        # the key would change how fields (notably dates) are mapped, so it is
        # left for a deliberate follow-up.
        request_body.update(mappings)
        print(request_body)
        if self.client.indices.exists(index_name):
            raise Exception(f'Error: index {index_name} exists.')
        # `ignore = 400` makes the client hand back the error body instead of
        # raising, so the response has to be inspected before claiming success.
        response = self.client.indices.create(index_name, body = request_body, ignore = 400)
        error = response.get('error') if isinstance(response, dict) else None
        if error:
            reason = error.get('reason', error) if isinstance(error, dict) else error
            print(f'Error: index {index_name} was not created: {reason}')
            return
        print(f'OK: index {index_name} created.')

    def do_index(self, docs, _index, _doc):
        """Bulk-index `docs` into `_index`, one random UUID id per document.

        Any exception raised while indexing is caught and printed rather than
        propagated; otherwise the success/failure result is printed.

        Args:
            docs: iterable of dict documents.
            _index: name of the target index.
            _doc: value attached to every action under the key "_doc".
        """
        def actions():
            # Lazily yield one bulk action per document.
            for doc in docs:
                yield {
                    "_index": _index,
                    # NOTE(review): "_doc" is passed through as-is; presumably
                    # meant as a document type -- confirm the bulk helper
                    # actually consumes this key.
                    "_doc": _doc,
                    "_id": uuid4(),
                    # shallow copy so the caller's document is not shared
                    "_source": dict(doc)
                }

        try:
            succ, fail = helpers.bulk(self.client, actions())
            print(f'Ok: success: {succ}; fail: {fail}')
        except Exception as e:
            print(str(e))

    def update_ranking_model(self, index_name, sim_module):
        """Apply `sim_module` to `index_name` and re-publish it under a new name.

        Steps: close the index, put the similarity settings, reopen it,
        reindex it into `<model type>_<base name>`, delete the old index and
        add an alias from the base name to the new index. The base name is the
        part of `index_name` after the last underscore.

        Args:
            index_name: index to update.
            sim_module: dict with the similarity name as its first key; the
                value under that key must contain a `type` entry.
        """
        # Resolve the model type first so that a malformed `sim_module` fails
        # before the index is closed or modified.
        name = list(sim_module)[0]
        model_type = sim_module[name]['type'].lower()
        base_index_name = index_name.split('_')[-1]
        new_index_name = f'{model_type}_{base_index_name}'

        settings = {
            'settings' : {
                'index' : {
                    'similarity' : sim_module
                }
            }
        }

        # the settings are applied while the index is closed
        self.client.indices.close(index = index_name)
        self.client.indices.put_settings(index = index_name, body = settings)
        self.client.indices.open(index = index_name)

        if self._re_index(index_name, new_index_name):
            # delete old index
            self.client.indices.delete(index = index_name)
            if self._update_alias(new_index_name, base_index_name):
                print(f"Index {index_name} updated with ranking model {model_type}")
        else:
            print(f"Failed to update {index_name} with ranking model {model_type}")

    def _re_index(self, index_name, new_index_name):
        """Reindex `index_name` into `new_index_name`.

        Returns:
            True when the response reports a `total` greater than zero.
        """
        response = self.client.reindex({
            'source' : {
                'index' : index_name
            },
            'dest' : {
                'index' : new_index_name
            }
        })
        return response['total'] > 0

    def _update_alias(self, index_name, alias):
        """Add `alias` as an alias of `index_name`.

        This lets callers keep using the old index name,
        e.g., history -> dfr_history.

        Returns:
            The truthiness of the response's `acknowledged` field.
        """
        response = self.client.indices.update_aliases(body = {
            'actions' : [{
                'add'  : {
                    'index' : index_name,
                    'alias' : alias
                }
            }]
        })
        return bool(response['acknowledged'])

    def index_info(self, index_name = None):
        """Print the `cat.indices` listing as JSON, for one index or for all."""
        params = {'format' : 'json'}
        if index_name:
            params['index'] = index_name
        print(json.dumps(self.client.cat.indices(**params), indent = 2))

    def _convert_to_date(self, field):
        """Parse `field` as a `%Y-%m-%d %H:%M:%S` timestamp when possible.

        Returns:
            A datetime when `field` is a string in that format; otherwise
            `field` itself, unchanged.
        """
        try:
            return dt.strptime(field, '%Y-%m-%d %H:%M:%S')
        except (TypeError, ValueError):
            # TypeError: not a string; ValueError: string in another format
            return field

    def extract_mappings(self, sample):
        """Infer field types from one sample document.

        String values that look like timestamps are treated as dates. Fields
        whose value type has no entry in `_FIELD_TYPES` (for example `None`)
        are left out of `properties` instead of raising.

        Args:
            sample: dict document; it is not modified.

        Returns:
            A dict of the form
            `{'mapping': {'_source': {...}, 'properties': {field: {'type': ...}}}}`.
        """
        sample_ = {
            field : self._convert_to_date(val)
            for field, val in sample.items()
        }
        print(sample_)

        properties = {}
        for property_, property_val in sample_.items():
            field_type = self._FIELD_TYPES.get(type(property_val).__name__)
            if field_type is not None:
                properties[property_] = { 'type' : field_type }

        return {
            'mapping' : {
                '_source' : {
                    'enabled' : 'true'
                },
                'properties' : properties
            }
        }

    def query(self, index, body = None):
        """Search `index` with `body`, requesting score explanations.

        Args:
            index: index (or alias) to search.
            body: query body; defaults to a `match_all` query.

        Returns:
            The client's search response.
        """
        # build the default per call instead of sharing one mutable dict
        if body is None:
            body = {"query": {"match_all": {}}}
        return self.client.search(body = body, index = index, explain = True)

    def get_hits(self, results, *fields, explain = True):
        """Condense a search response into a count plus per-hit summaries.

        Args:
            results: search response as returned by `query`.
            *fields: names of `_source` fields to copy into each summary.
            explain: when true, include each hit's `_explanation`.

        Returns:
            `{'count': <total hits>, 'hits': {<_id>: {<fields>, 'score', ['explanation']}}}`.
        """
        print(fields)
        hits = {}
        for hit in results['hits']['hits']:
            summary = { field : hit['_source'][field] for field in fields }
            summary['score'] = hit['_score']
            if explain:
                summary['explanation'] = hit['_explanation']
            hits[hit['_id']] = summary
        return {
            'count' : results['hits']['total']['value'],
            'hits' : hits
        }
    
    

In [None]:
# mock dataset standing in for the aggregator's output (browser-history records)
docs = json.loads("""
[
  {
    "id": 12181,
    "url": "https://www.google.com/search?q=sqlite+chrome+history&oq=sqlite+chrome+&aqs=chrome.3.69i57j0i512l3j0i22i30l6.3022j0j4&sourceid=chrome&ie=UTF-8",
    "title": "sqlite chrome history - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:14",
    "visit_date": "2023-03-19 21:50:13",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12181,
    "url": "https://www.google.com/search?q=sqlite+chrome+history&oq=sqlite+chrome+&aqs=chrome.3.69i57j0i512l3j0i22i30l6.3022j0j4&sourceid=chrome&ie=UTF-8",
    "title": "sqlite chrome history - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:14",
    "visit_date": "2023-03-19 21:50:14",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12182,
    "url": "https://en.wikiversity.org/wiki/Chromium_browsing_history_database",
    "title": "Chromium browsing history database - Wikiversity",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:26",
    "visit_date": "2023-03-19 21:50:26",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12183,
    "url": "https://www.researchgate.net/figure/Chrome-history-SQLite-The-highlighted-record-corresponds-to-a-bookmark-added-in-the_fig1_262880203",
    "title": "Chrome history SQLite. The highlighted record corresponds to a bookmark... | Download Scientific Diagram",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:27",
    "visit_date": "2023-03-19 21:50:27",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12191,
    "url": "https://github.com/tomasraposo/ir-search-engine/blob/714f37b9808718ebae220c8f64e7e83070d0117e/src/aggregator.ipynb",
    "title": "ir-search-engine/aggregator.ipynb at 714f37b9808718ebae220c8f64e7e83070d0117e \u00b7 tomasraposo/ir-search-engine",
    "visit_count": 3,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:42",
    "visit_date": "2023-03-19 21:55:12",
    "from_visit": 25,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12191,
    "url": "https://github.com/tomasraposo/ir-search-engine/blob/714f37b9808718ebae220c8f64e7e83070d0117e/src/aggregator.ipynb",
    "title": "ir-search-engine/aggregator.ipynb at 714f37b9808718ebae220c8f64e7e83070d0117e \u00b7 tomasraposo/ir-search-engine",
    "visit_count": 3,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:42",
    "visit_date": "2023-03-19 21:55:12",
    "from_visit": 27,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12192,
    "url": "https://www.google.com/search?q=firefox+host&oq=firefox+host&aqs=chrome..69i57j0i512l7j0i22i30l2.3143j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox host - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:08",
    "visit_date": "2023-03-19 21:57:07",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12192,
    "url": "https://www.google.com/search?q=firefox+host&oq=firefox+host&aqs=chrome..69i57j0i512l7j0i22i30l2.3143j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox host - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:08",
    "visit_date": "2023-03-19 21:57:08",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12194,
    "url": "https://www.google.com/search?q=firefox+sqlite+datbase+schema&oq=firefox+sqlite+datbase+schema&aqs=chrome..69i57j33i10i160j33i10i22i29i30l5j33i10i15i22i29i30.4066j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox sqlite datbase schema - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:51",
    "visit_date": "2023-03-19 21:57:50",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12194,
    "url": "https://www.google.com/search?q=firefox+sqlite+datbase+schema&oq=firefox+sqlite+datbase+schema&aqs=chrome..69i57j33i10i160j33i10i22i29i30l5j33i10i15i22i29i30.4066j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox sqlite datbase schema - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:51",
    "visit_date": "2023-03-19 21:57:51",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12195,
    "url": "https://wiki.mozilla.org/File:Places.sqlite.schema.pdf",
    "title": "File:Places.sqlite.schema.pdf - MozillaWiki",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:57:54",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12196,
    "url": "https://mozilla.github.io/firefox-browser-architecture/text/0010-firefox-data-stores.html",
    "title": "Firefox Data Stores",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:02",
    "visit_date": "2023-03-19 21:58:02",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12197,
    "url": "https://www.google.com/search?q=firefox+sqlite+history+schemas&ei=XoUXZNfqO-P0qwHF5pSQDw&ved=0ahUKEwjXkdK2_uj9AhVj-ioKHUUzBfIQ4dUDCA8&uact=5&oq=firefox+sqlite+history+schemas&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIICCEQoAEQwwQyCAghEKABEMMEMggIIRCgARDDBDoICAAQhgMQsAM6BAgAEB46BggAEAgQHjoFCAAQhgM6CgghEKABEMMEEApKBAhBGAFQxQJYwhZg0BdoAXAAeACAAb0CiAGvC5IBBzAuNi4xLjGYAQCgAQHIAQTAAQE&sclient=gws-wiz-serp",
    "title": "firefox sqlite history schemas - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:58:20",
    "from_visit": 34,
    "visit_type": "submit",
    "browser": "Chrome"
  },
  {
    "id": 12197,
    "url": "https://www.google.com/search?q=firefox+sqlite+history+schemas&ei=XoUXZNfqO-P0qwHF5pSQDw&ved=0ahUKEwjXkdK2_uj9AhVj-ioKHUUzBfIQ4dUDCA8&uact=5&oq=firefox+sqlite+history+schemas&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIICCEQoAEQwwQyCAghEKABEMMEMggIIRCgARDDBDoICAAQhgMQsAM6BAgAEB46BggAEAgQHjoFCAAQhgM6CgghEKABEMMEEApKBAhBGAFQxQJYwhZg0BdoAXAAeACAAb0CiAGvC5IBBzAuNi4xLjGYAQCgAQHIAQTAAQE&sclient=gws-wiz-serp",
    "title": "firefox sqlite history schemas - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:58:21",
    "from_visit": 37,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12195,
    "url": "https://wiki.mozilla.org/File:Places.sqlite.schema.pdf",
    "title": "File:Places.sqlite.schema.pdf - MozillaWiki",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:58:21",
    "from_visit": 38,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12198,
    "url": "https://wiki.mozilla.org/images/0/08/Places.sqlite.schema.pdf",
    "title": "Places.sqlite.schema.pdf",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:24",
    "visit_date": "2023-03-19 21:58:24",
    "from_visit": 39,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12199,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome..69i57.4730j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:04:02",
    "visit_date": "2023-03-19 22:04:01",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12199,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome..69i57.4730j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:04:02",
    "visit_date": "2023-03-19 22:04:02",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12200,
    "url": "https://raw.githubusercontent.com/mozilla/gecko-dev/master/toolkit/components/places/nsPlacesIndexes.h",
    "title": "",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:04:14",
    "visit_date": "2023-03-19 22:04:14",
    "from_visit": 42,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12201,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome.0.69i59.2125j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:05:04",
    "visit_date": "2023-03-19 22:05:04",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12201,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome.0.69i59.2125j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:05:04",
    "visit_date": "2023-03-19 22:05:04",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12191,
    "url": "https://github.com/tomasraposo/ir-search-engine/blob/714f37b9808718ebae220c8f64e7e83070d0117e/src/aggregator.ipynb",
    "title": "ir-search-engine/aggregator.ipynb at 714f37b9808718ebae220c8f64e7e83070d0117e \u00b7 tomasraposo/ir-search-engine",
    "visit_count": 3,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:42",
    "visit_date": "2023-03-19 22:19:42",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12216,
    "url": "https://github.com/tomasraposo/ir-search-engine",
    "title": "tomasraposo/ir-search-engine",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:44",
    "visit_date": "2023-03-19 22:19:44",
    "from_visit": 69,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12217,
    "url": "https://github.com/tomasraposo/ir-search-engine/tree/aggregator",
    "title": "tomasraposo/ir-search-engine at aggregator",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:20:08",
    "visit_date": "2023-03-19 22:20:08",
    "from_visit": 70,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 1,
    "url": "https://www.google.com/search?channel=fs&client=ubuntu&q=mozilla+sqlite+schemas+",
    "title": "mozilla sqlite schemas - Google Search",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:51",
    "visit_date": "2023-03-19 21:18:51",
    "from_visit": 0,
    "visit_type": "typed",
    "browser": "Firefox"
  },
  {
    "id": 2,
    "url": "https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjus4rb9ej9AhUKt4sKHQ91AZUQFnoECA0QAQ&url=https%3A%2F%2Fwiki.mozilla.org%2Fimages%2F0%2F08%2FPlaces.sqlite.schema.pdf&usg=AOvVaw1VqHh-NQHUFYqoK6-DldIH",
    "title": null,
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:55",
    "visit_date": "2023-03-19 21:18:55",
    "from_visit": 1,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 3,
    "url": "https://wiki.mozilla.org/images/0/08/Places.sqlite.schema.pdf",
    "title": "Places.sqlite.schema.pdf",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:56",
    "visit_date": "2023-03-19 21:18:56",
    "from_visit": 2,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 4,
    "url": "https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjus4rb9ej9AhUKt4sKHQ91AZUQFnoECA4QAQ&url=https%3A%2F%2Fwiki.mozilla.org%2Fimages%2F7%2F72%2FContent-prefs.sqlite.schema.pdf&usg=AOvVaw2xp8uTcWWZhEur4dMUmp4v",
    "title": null,
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:59",
    "visit_date": "2023-03-19 21:18:59",
    "from_visit": 1,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 5,
    "url": "https://wiki.mozilla.org/images/7/72/Content-prefs.sqlite.schema.pdf",
    "title": "Content-prefs.sqlite.schema.pdf",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:01",
    "visit_date": "2023-03-19 21:19:01",
    "from_visit": 4,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 6,
    "url": "https://www.google.com/search?q=sqlite+documentation&client=ubuntu&hs=qFP&channel=fs&ei=O3wXZK6qHYrurgSP6oWoCQ&ved=0ahUKEwjus4rb9ej9AhUKt4sKHQ91AZUQ4dUDCGo&uact=5&oq=sqlite+documentation&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIGCAAQFhAeMgYIABAWEB4yBggAEBYQHjIGCAAQFhAeOgoIABBHENYEELADOgQIABBDOgUIABCRAjoLCC4QgAQQxwEQ0QM6BQgAEIYDSgQIQRgAUJsJWPcXYPYaaANwAXgAgAGVAYgB-BCSAQQ1LjE1mAEAoAEByAECwAEB&sclient=gws-wiz-serp",
    "title": "sqlite documentation - Google Search",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:07",
    "visit_date": "2023-03-19 21:19:07",
    "from_visit": 1,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 7,
    "url": "file:///home/tomasraposo/.local/share/jupyter/runtime/nbserver-962520-open.html",
    "title": "Opening Jupyter Notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:11",
    "visit_date": "2023-03-19 21:19:11",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 8,
    "url": "http://localhost:8888/tree?token=1de4774f1cd881f1ed29059dd80fc03ec3f54e40761b2f0c",
    "title": "Home Page - Select or create a notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:12",
    "visit_date": "2023-03-19 21:19:12",
    "from_visit": 7,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 9,
    "url": "http://localhost:8888/tree",
    "title": "Home Page - Select or create a notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:12",
    "visit_date": "2023-03-19 21:19:12",
    "from_visit": 8,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 10,
    "url": "http://localhost:8888/notebooks/aggregator.ipynb",
    "title": "aggregator - Jupyter Notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:26",
    "visit_date": "2023-03-19 21:19:26",
    "from_visit": 9,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 11,
    "url": "http://localhost:8888/notebooks/aggregator.ipynb#",
    "title": "aggregator - Jupyter Notebook",
    "visit_count": 2,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:27:19",
    "visit_date": "2023-03-19 21:20:25",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 14,
    "url": "http://localhost:8888/notebooks/aggregator.ipynb#",
    "title": "aggregator - Jupyter Notebook",
    "visit_count": 2,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:27:19",
    "visit_date": "2023-03-19 21:27:19",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 15,
    "url": "http://localhost:8888/notebooks/Untitled1.ipynb?kernel_name=python3",
    "title": "Untitled1 - Jupyter Notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:27:20",
    "visit_date": "2023-03-19 21:27:20",
    "from_visit": 14,
    "visit_type": "link",
    "browser": "Firefox"
  }
]
""")
# report how many records were parsed from the JSON above
print(f'Total number of docs: {len(docs)}')

In [None]:
# Build the search engine; if construction fails, show the message and carry on.
try:
    se = SearchEngine(host)
except Exception as err:
    print(str(err))

In [None]:
# print the cluster's info() response as JSON
se.cluster_info()

In [None]:
# WARNING: this deletes every index matching '*' on the cluster.
# Run it by hand only when something went wrong and you want to start again from scratch.
se.client.indices.delete(index = '*')

## Ranking models
_BM25 similarity (default)_

Note: we're required to explain how ranking works for each of the models used, i.e. how it reflects in the documents returned.


### BM25 (Best Match Okapi)
This is the default ranking model used by Elasticsearch.

In [None]:
# Named BM25 similarity: `b` controls document-length normalisation,
# `k1` controls term-frequency saturation.
sim_bm25 = {
    'sim_bm25' : dict(type = 'BM25', b = '0.75', k1 = 1.2)
}

In [None]:
# get document sample: the first record is used to infer the field types
sample_doc = docs[0]

# extract mappings from document sample
mappings = se.extract_mappings(sample_doc)

# create index
index_name = 'history'
_doc = 'browser_history'
try:
    se.create_index(index_name, sim_bm25, mappings)
    # bulk index docs
    se.do_index(docs, index_name, _doc)
except Exception as e:
    # Report the failure (e.g. the index already exists) instead of hiding it;
    # the cell still continues so the index listing below is shown either way.
    print(str(e))

se.index_info()

In [None]:
# get all records: with no `body`, `se.query` uses its default `match_all` query
q = se.query(index_name)
print(json.dumps(q, indent = 2))

In [None]:
# term query: documents whose `title` contains the term `sqlite`
query = {'query' : {'term' : {'title' : 'sqlite'}}}

res = se.query(index_name, body = query)
hits = se.get_hits(
    res,
    'url', 'title', 'visit_date', 'last_visit_date',
    explain = False
)
print(json.dumps(hits, indent = 2))

In [None]:
# A `bool` query combines clauses into boolean expressions:
# `should` behaves like an OR, `must` behaves like an AND,
# and the clauses may target different fields.

# search for any of the keywords in the title
keywords = 'sqlite documentation history'
query = {
    'query' : {
        'bool' : {
            'should' : [{'terms' : {'title' : keywords.split(' ')}}]
        }
    }
}
res = se.query(index_name, body = query)
hits = se.get_hits(
    res,
    'url', 'title', 'visit_date', 'last_visit_date',
    explain = False
)
print(json.dumps(hits, indent = 2))

In [None]:
# search for documents whose `last_visit_date` is on or after March 29 and that
# match either `history` in the `title` or `google` in the `url`
# note: a `match` query analyses its input as plain text, so the slashes in
# '/google/' are NOT regex delimiters (a `regexp` query would be needed for that).
query = {
    'query' : {
        'bool' : {
            'must' : [
                {
                    'range' : {
                        'last_visit_date' : {
                            # year-month-day order, matching the stored dates
                            'gte' : '2023-03-29'
                        }
                    }
                }
            ],
            # One clause per dict: two 'match' keys inside a single dict
            # literal collapse into the last one, dropping the `title` clause.
            'should' : [
                {
                    'match' : {
                        'title' : 'history'
                    }
                },
                {
                    'match' : {
                        'url' :  '/google/'
                    }
                }
            ],
            # alongside a `must` clause, `should` clauses are optional unless
            # at least one is explicitly required
            'minimum_should_match' : 1
        }
    }
}
res = se.query(index_name, body = query)
hits = se.get_hits(res, *['url', 'title', 'visit_date', 'last_visit_date'], explain = False)
print(json.dumps(hits, indent = 2))

### DFR (Divergence from Randomness)
This model takes into account statistical properties of the collection, e.g. frequency and distribution of terms within the collection, length of documents, etc.  

In [None]:
# DFR similarity settings: basic model, after effect and normalisation
# (h2 normalisation with its `c` parameter set to 3.0)
sim_dfr = {
    "sim_dfr": {
      "type": "DFR",
      "basic_model": "g",
      "after_effect": "l",
      "normalization": "h2",
      "normalization.h2.c": "3.0"
    }
}
# apply the settings to `index_name`; the data is reindexed into `dfr_<base name>`
se.update_ranking_model(index_name, sim_dfr)

In [None]:
# list all indices to check the result of the ranking-model update
se.index_info()

### BM25F
BM25F can be approximated with a `multi_match` query, which lets us assign a different weight (boost) to each field.

In [None]:
# BM25 parameters reused for the BM25F experiment
sim_bm25f = {
    "sim_bm25f": {
        'type' : 'BM25',
        'b' : '0.75',
        'k1' : 1.2
    }
}
# we have to update using the actual index name not the alias
# this is a known issue in Elasticsearch
# After the DFR update above, `index_name` is only an alias; the concrete
# index is `dfr_<index_name>` (update_ranking_model names the new index
# `<model type>_<base name>`).
# you can check the index name with `se.index_info()` in the cell above
se.update_ranking_model(f'dfr_{index_name}', sim_bm25f)

In [None]:
# list all indices again to check the concrete index name after the update
se.index_info()

In [None]:
# per-field boosts, written in the `field^weight` form used by `multi_match`
fields = [
    f'{field}^{str(weight)}'
    for field, weight in (
        ('title', 3),
        ('url', 2),
        ('browser', 1),
        ('last_visit_date', 3),
    )
]

In [None]:
# `combined_fields` was introduced in v7.13 and truly implements BM25F.
# As we are running an older version (as provided by the lecturer),
# we have to either use `multi_match` or bump the version (not sure we can).
# see: https://opensourceconnections.com/blog/2021/06/30/better-term-centric-scoring-in-elasticsearch-with-bm25f-and-the-combined_fields-query/

query = {
    'query' : {
        'multi_match' : {'query' : 'sqlite', 'fields' : fields, 'type' : 'cross_fields'}
    }
}

q1 = se.query(index_name, body = query)
hits = se.get_hits(
    q1,
    'url', 'title', 'visit_date', 'last_visit_date',
    explain = False
)
print(json.dumps(hits, indent = 2))

### User Interface