### Setup Elasticsearch cluster

In [None]:
import os
import time
# Replace the IPython shell's `system` hook with os.system, so the `!cmd` shell
# escapes used in the cells below are executed through os.system.
get_ipython().system = os.system

In [None]:
# download Elasticsearch binaries into downloads folder
# <YOUR PASSWORD> is your sudo password (do not save the notebook with the real value filled in)
# -p: do not fail when the folder already exists (e.g. when the cell is re-run)
!mkdir -p ../downloads
# -nc: skip the download when the file is already there instead of saving a ".1" duplicate
!wget -q -nc https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz -P ../downloads
!wget -q -nc https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512 -P ../downloads
# verify the archive BEFORE unpacking it, and only unpack when the check passes;
# run from the downloads folder because the .sha512 file refers to the archive by its bare file name
!cd ../downloads && shasum -a 512 -c elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512 && tar -xzf elasticsearch-oss-7.9.2-linux-x86_64.tar.gz
# -S makes sudo read the password from stdin (the echo pipe), as the server start-up cell already does
!echo "<YOUR PASSWORD>" | sudo -S chown -R daemon:daemon ../downloads/elasticsearch-7.9.2/

In [None]:
# start Elasticsearch server as bg process
# sudo flags: -S reads the password from stdin (the echo pipe), -H sets HOME to the
# target user's home directory, -u daemon runs the server as the `daemon` user;
# the trailing `&` backgrounds the command so the cell returns immediately
!echo "<YOUR PASSWORD>" | sudo -HSu daemon ../downloads/elasticsearch-7.9.2/bin/elasticsearch &

In [None]:
# check the daemon process status
# (the grep process itself may also show up in this listing)
!!ps -ef | grep elasticsearch

In [1]:
# HTTP endpoint of the local Elasticsearch node, shared by the curl check
# and by the SearchEngine client
port = 9200
host = 'http://localhost:' + str(port)

In [None]:
# check the cluster started up correctly
# give the server a fixed 30 seconds to boot before probing it
time.sleep(30)
# plain HTTP GET on the root endpoint; {host} is filled in from the Python variable `host`
!!curl -s {host}

## Collect browser history

In [7]:
from os import path
import os
import glob
import platform
import shutil
import tempfile
import sqlite3
import re
import json

#### Base aggregator
Every browser implements the base class because, except for Chromium-based ones, they differ in implementation.

In [None]:
class Aggregator:
    """Base class that locates browser history databases and accumulates their records.

    Used as a context manager: entering copies every history database that is
    found into the temp directory and records the paths in ``self.db_files``;
    leaving removes those copies again.  Subclasses are expected to provide
    ``self.conn`` / ``self.cursor``, ``_visit_type`` and ``get_history_as_json``.
    """

    def __init__(self):
        # merged history records of all browsers; None until the first merge()
        self.browsing_history = None

    def __enter__(self):
        """Scan for browser history databases and copy the ones found.

        Populates ``self.db_files`` as ``{browser: {'orig': ..., 'tmp': ...}}``.
        For a browser that is found, 'orig' is the list of matching database
        paths and 'tmp' the path of the temp copy; otherwise both stay ``{}``.

        Returns:
            self
        """
        # scan browsers internal history dbs
        curr_os = platform.system()
        home_dir = path.expanduser('~')
        oss_bin_paths = {
            'Linux'  : ('/', 'usr', 'bin'),
            'Darwin' : ('/', 'Applications'),
            'Windows': ('C:/', 'Program Files')
        }
        # /usr/bin: symlinked from /usr/lib
        install_dirs = {
            os_name : path.join(*path_comps)
            for os_name, path_comps in oss_bin_paths.items()
        }
        browsers_data_stores = {
            'Chrome' : ('.config', 'google-chrome', 'Default', 'History'),
            'Firefox' : ('.mozilla', 'firefox', '*.default', 'places.sqlite')
        }
        browsers = {
            browser : path.join(home_dir, *path_comps)
            for browser, path_comps in browsers_data_stores.items()
        }
        # store both locked and lock-free db copies
        self.db_files = {
            browser : {
                file_t : {} for file_t in ['orig', 'tmp']
            }
            for browser in browsers
        }

        # None on a platform that is not listed above
        install_dir = install_dirs.get(curr_os)
        for browser, db_pattern in browsers.items():
            # glob.glob() returns a list (never None), so test it for emptiness
            installed = install_dir is not None and bool(
                glob.glob(f'{install_dir}/*{browser.lower()}*')
            )
            matches = glob.glob(db_pattern)
            if not installed or not matches:
                # nothing to copy: keep the empty placeholders for this browser
                continue
            # the profile pattern may match several databases: use the most
            # recently modified one
            orig_file = max(matches, key = path.getmtime)
            try:
                tmp_file = self._tmp_copy(orig_file)
            except OSError as err:
                # one unreadable database must not prevent scanning the others
                print(f'{browser}: could not copy {orig_file}: {err}')
                continue
            self.db_files[browser]['orig'] = matches
            self.db_files[browser]['tmp'] = tmp_file

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Delete the temp copies made by ``__enter__``; exceptions are not suppressed."""
        # delete tmp copies on exit
        for files in getattr(self, 'db_files', {}).values():
            tmp_file = files['tmp']
            if not tmp_file:
                # browser was not found, so there is no copy to delete
                continue
            try:
                os.remove(tmp_file)
            except FileNotFoundError:
                # already gone: nothing left to clean up
                pass

    def _tmp_copy(self, original_file):
        """Copy ``original_file`` into the temp directory and return the copy's path."""
        tmp = tempfile.gettempdir()
        filename = path.basename(original_file)
        tmp_file = path.join(tmp, filename)
        shutil.copy2(original_file, tmp_file)
        return tmp_file

    @classmethod
    def _regexp(cls, regex, field_val):
        """Implementation of SQLite's REGEXP operator: True when ``regex`` matches ``field_val``."""
        # base REGEXP implementation; a NULL column value arrives as None and never matches
        if field_val is None:
            return False
        return bool(re.search(regex, field_val))

    def _extract(self, row):
        """Return the first element of ``row``."""
        return row[0]

    def _get_history_tables(self, conn, cursor):
        """Return the names of all tables whose name contains 'history' or 'visit'."""
        # SQLite has no built-in REGEXP; register ours on this connection
        conn.create_function("REGEXP", 2, Aggregator._regexp)
        regex = '.*(history|visit).*'
        query = cursor.execute("""
            SELECT name FROM sqlite_master
            WHERE type = 'table' AND name REGEXP ?
        """, [regex])
        return [self._extract(row) for row in query.fetchall()]

    def _get_fields(self, table):
        """Return the column names of ``table`` (uses ``self.cursor``)."""
        # looksup a table's columns
        # identifiers cannot be bound as parameters, so quote the name instead;
        # LIMIT 0 still fills cursor.description without reading any rows
        quoted = '"' + table.replace('"', '""') + '"'
        query = self.cursor.execute(f'SELECT * FROM {quoted} LIMIT 0')
        return [self._extract(column) for column in query.description]

    def _to_dict(self, row, fields):
        """Map one result row to ``{column name: value}``.

        ``fields`` is the cursor description of the query.  A ``visit_type``
        value is translated with the subclass' ``_visit_type`` and reduced to
        the lower-cased part after the last underscore.
        """
        # converts row to dict by mapping columns to values
        dict_ = {}
        for column, val in zip(fields, row):
            field = self._extract(column)
            if field == 'visit_type':
                val = self._visit_type(val)
                val = val.split('_')[-1].lower()
            dict_[field] = val
        return dict_

    def _to_json(self, cursor):
        """Fetch every row of the executed ``cursor`` as a list of dicts."""
        fields = cursor.description
        return [self._to_dict(row, fields) for row in cursor.fetchall()]

    def merge(self, history):
        """Append the records in ``history`` to ``self.browsing_history``."""
        if not self.browsing_history:
            # copy, so that later merges do not grow the caller's own list
            self.browsing_history = list(history or [])
        else:
            self.browsing_history.extend(history or [])

    def get_history_as_json(self):
        # returns browser history as list of json documents
        raise NotImplementedError

    def save(self):
        """Write the merged history to ``../dataset/browsing_history.json`` (no-op when empty)."""
        if self.browsing_history:
            dataset_dir = '../dataset'
            # a fresh checkout may not have the output folder yet
            os.makedirs(dataset_dir, exist_ok = True)
            file_path = os.path.join(dataset_dir, 'browsing_history.json')
            with open(file_path, 'w') as file:
                json.dump(self.browsing_history, file, indent = 2)
                

In [None]:
class AggregatorFirefox(Aggregator):
    """Reads browsing history from a Firefox ``places.sqlite`` database."""

    # written names of the visit types; a stored value N maps to entry N - 1
    _VISIT_TYPES = (
        'TRANSITION_LINK',
        'TRANSITION_TYPED',
        'TRANSITION_BOOKMARK',
        'TRANSITION_EMBED',
        'TRANSITION_REDIRECT_PERMANENT',
        'TRANSITION_REDIRECT_TEMPORARY',
        'TRANSITION_DOWNLOAD',
        'TRANSITION_FRAMED_LINK',
        'TRANSITION_RELOAD'
    )

    def __init__(self, db_file):
        """Open ``db_file`` read-only.

        Raises:
            sqlite3.OperationalError: unable to open database file
        """
        super().__init__()
        # mode=ro: the history database is only ever read
        self.conn = sqlite3.connect(f'file:{db_file}?mode=ro', uri = True)
        self.cursor = self.conn.cursor()
        # print tracebacks of errors raised inside SQL callbacks such as REGEXP
        sqlite3.enable_callback_tracebacks(True)

    def _visit_type(self, enum):
        """Map a stored visit type to its written name.

        Values outside 1..9 (including 0 and NULL) give 'TRANSITION_UNKNOWN'
        instead of wrapping around through a negative index or raising IndexError.
        """
        # maps visit types to their written rep for ease of querying
        if isinstance(enum, int) and 1 <= enum <= len(self._VISIT_TYPES):
            return self._VISIT_TYPES[enum - 1]
        return 'TRANSITION_UNKNOWN'

    def get_history_as_json(self):
        """Return one dict per visit, joined with the data of the visited place."""
        # returns browser history as list of json documents
        # typed_count is cast to INTEGER so it has the same type as Chrome's
        # typed_count column (printf() produced a string such as "0")
        # NOTE(review): dates are left in UTC here while the Chrome query converts
        # to 'localtime' - confirm which one the dataset should use
        self.cursor.execute("""
            SELECT moz_historyvisits.id,
                   moz_places.url, 
                   moz_places.title, 
                   moz_places.visit_count,
                   (
                       SELECT CAST(total(use_count) AS INTEGER)
                       FROM moz_inputhistory 
                       WHERE moz_inputhistory.place_id = moz_places.id
                   ) typed_count,
                   DATETIME(moz_places.last_visit_date/1000000,'unixepoch') as last_visit_date, 
                   DATETIME(moz_historyvisits.visit_date/1000000, 'unixepoch') as visit_date, 
                   moz_historyvisits.from_visit,
                   moz_historyvisits.visit_type,
                   'Firefox' as browser
            FROM moz_places, moz_historyvisits 
            WHERE moz_historyvisits.place_id = moz_places.id;
        """)
        rows_as_json = self._to_json(self.cursor)
        return rows_as_json

In [None]:
class AggregatorChrome(Aggregator):
    """Reads browsing history from a Chrome ``History`` database."""

    # only the low byte of a stored transition selects the visit type
    _CORE_MASK = 0xFF
    # written names of the visit types, indexed by the masked transition value
    _VISIT_TYPES = (
        'LINK',
        'TYPED',
        'AUTO_BOOKMARK',
        'AUTO_SUBFRAME',
        'MANUAL_SUBFRAME',
        'GENERATED',
        'START_PAGE',
        'FORM_SUBMIT',
        'RELOAD',
        'KEYWORD',
        'KEYWORD_GENERATED'
    )

    def __init__(self, db_file):
        """Open ``db_file`` read-only.

        Raises:
            sqlite3.OperationalError: unable to open database file
        """
        super().__init__()
        # mode=ro: the history database is only ever read
        self.conn = sqlite3.connect(f'file:{db_file}?mode=ro', uri = True)
        self.cursor = self.conn.cursor()
        # print tracebacks of errors raised inside SQL callbacks such as REGEXP
        sqlite3.enable_callback_tracebacks(True)

    def _visit_type(self, enum):
        """Map a stored transition value to the written name of its visit type.

        A masked value that is not in the table (or a NULL) gives 'UNKNOWN'
        instead of raising.
        """
        # maps visit types to their written rep for ease of querying
        if not isinstance(enum, int):
            return 'UNKNOWN'
        core = enum & self._CORE_MASK
        if core < len(self._VISIT_TYPES):
            return self._VISIT_TYPES[core]
        return 'UNKNOWN'

    def get_history_as_json(self):
        """Return one dict per visit, joined with the data of the visited URL."""
        # returns browser history as list of json documents
        # timestamps are microseconds counted from 1601-01-01, hence the offset
        # added before converting with 'unixepoch'
        # NOTE(review): `id` is the URL id here (repeated for every visit of the
        # same URL) while from_visit refers to visit ids - confirm this is intended
        self.cursor.execute("""
            SELECT urls.id,
                   urls.url, 
                   urls.title, 
                   urls.visit_count, 
                   urls.typed_count, 
                   DATETIME(urls.last_visit_time / 1000000 + (strftime('%s', '1601-01-01')), 'unixepoch', 'localtime') as last_visit_date, 
                   DATETIME(visit_time / 1000000 + (strftime('%s', '1601-01-01')), 'unixepoch', 'localtime') as visit_date, 
                   visits.from_visit, 
                   visits.transition as visit_type,
                   'Chrome' as browser
            FROM urls, visits
            WHERE urls.id = visits.url
        """)
        rows_as_json = self._to_json(self.cursor)
        return rows_as_json

#### Extract browsers' history 

In [None]:
# explicit browser -> aggregator pairing (pairing by position depended on the
# key order of db_files)
aggregators = {
    'Chrome' : AggregatorChrome,
    'Firefox' : AggregatorFirefox
}

with Aggregator() as agg:
    db_files = agg.db_files
    for browser, AggregatorBrowser in aggregators.items():
        db_file = db_files[browser]['tmp']
        print(f'\n{browser}: ', db_file, end="\n\n")
        try:
            # fails when no database copy exists for this browser
            agg_browser = AggregatorBrowser(db_file)
        except sqlite3.OperationalError as e:
            print(str(e))
            continue

        try:
            tables = agg_browser._get_history_tables(agg_browser.conn, agg_browser.cursor)

            for table in tables:
                fields = agg_browser._get_fields(table)
                print(table, fields, sep = "\n", end = "\n\n")

            json_docs = agg_browser.get_history_as_json()
            agg.merge(json_docs)
        except sqlite3.OperationalError as e:
            print(str(e))
        finally:
            # release the temp copy before the context manager deletes it
            agg_browser.conn.close()

    print(json.dumps(agg.browsing_history, indent = 2))

    agg.save()

## Build a search engine

In [2]:
import elasticsearch as es
import elasticsearch.helpers as helpers
from uuid import uuid4
from datetime import datetime as dt
import json
from os import path

In [3]:
class SearchEngine:
    """Indexes the saved browsing history into Elasticsearch and queries it.

    ``self.client`` is the Elasticsearch client; ``self.docs`` is the list of
    history records read from ``../dataset/browsing_history.json``.
    """

    # Elasticsearch date pattern equivalent to the strptime pattern used by
    # _convert_to_date ('%Y-%m-%d %H:%M:%S'); without it a `date` field would
    # be parsed with the default format, which expects ISO 8601 input
    _ES_DATE_FORMAT = 'yyyy-MM-dd HH:mm:ss'

    def __init__(self, host):
        """Connect to the cluster at ``host`` and load the history documents.

        Raises:
            Exception: the cluster does not answer a ping.
        """
        self.client = es.Elasticsearch(host)
        self.docs = self.read_browser_history()
        ping = self.client.ping()
        if not ping:
            raise Exception('Error: could not connect to cluster')
        print('Ok: cluster is up')

    def read_browser_history(self):
        """Return the documents stored in ``../dataset/browsing_history.json``."""
        docs_path = path.join('..', 'dataset', 'browsing_history.json')
        with open(docs_path, 'r') as f:
            return json.load(f)

    def cluster_info(self):
        """Print the cluster info as indented JSON."""
        print(json.dumps(self.client.info(), indent = 2))

    def create_index(self, index_name, _doc, sim_module):
        """Create ``index_name`` with explicit mappings and bulk-index ``self.docs``.

        Args:
            index_name: name of the index to create.
            _doc: kept for backward compatibility; passed on to ``_do_index``.
            sim_module: ``{name: similarity settings}``; the first name is
                attached to every text field so that the configured
                similarity is actually used for scoring.

        Raises:
            Exception: there are no documents, or the index already exists.
        """
        if not self.docs:
            raise Exception('Error: no documents to index.')
        # get document sample
        sample_doc = self.docs[0]
        sim_name = next(iter(sim_module)) if sim_module else None
        # extract mappings from document sample
        mappings = self._extract_mappings(sample_doc, similarity = sim_name)
        settings = {
            'number_of_shards': 1,
            'number_of_replicas': 1
        }
        if sim_module:
            settings['similarity'] = sim_module
        request_body = { 'settings': settings }
        request_body.update(mappings)
        if self.client.indices.exists(index_name):
            raise Exception(f'Error: index {index_name} exists.')
        # no ignore=400: a rejected body must surface here, otherwise the bulk
        # request below silently auto-creates the index with default settings
        self.client.indices.create(index_name, body = request_body)
        # bulk index docs
        self._do_index(self.docs, index_name, _doc)
        print(f'OK: index {index_name} created.')

    def _do_index(self, docs, _index, _doc):
        """Bulk-index ``docs`` into ``_index``, using each document's position as its id.

        ``_doc`` is accepted for backward compatibility and not sent.  Errors
        are printed rather than raised (best effort).
        """
        def bulk(docs, _index):
            for i, doc in enumerate(docs):
                yield {
                    "_index": _index,
                    "_id": i,
                    "_source": dict(doc)
                }

        try:
            succ, fail = helpers.bulk(self.client, bulk(docs, _index))
            print(f'Ok: success: {succ}; fail: {fail}')
        except Exception as e:
            print(str(e))

    def update_ranking_model(self, index_name, sim_module):
        """Switch ``index_name`` to the similarity in ``sim_module``.

        Puts the similarity settings on the (temporarily closed) index,
        reindexes it into ``<model type>_<base name>``, deletes the old index
        and aliases the base name to the new one.
        """
        name = next(iter(sim_module))
        settings = {
            'settings' : {
                'index' : {
                    'similarity' : sim_module
                }
            }
        }

        # the similarity settings are applied while the index is closed
        self.client.indices.close(index = index_name)
        self.client.indices.put_settings(index = index_name, body = settings)
        self.client.indices.open(index = index_name)

        model_type = sim_module[name]['type'].lower()
        base_index_name = index_name.split('_')[-1]
        new_index_name = f'{model_type}_{base_index_name}'

        # NOTE(review): the destination index is not created beforehand, so it
        # presumably does not inherit the settings put above - confirm against
        # the Reindex API documentation
        if self._re_index(index_name, new_index_name):
            # delete old index
            self.client.indices.delete(index = index_name)
            if self._update_alias(new_index_name, base_index_name):
                print(f"Index {index_name} updated with ranking model {model_type}")
        else:
            print(f"Failed to update {index_name} with ranking model {model_type}")

    def _re_index(self, index_name, new_index_name):
        """Reindex ``index_name`` into ``new_index_name``; True when documents were processed."""
        # reindexes the old index with a new name
        res = self.client.reindex({
            'source' : {
                'index' : index_name
            },
            'dest' : {
                'index' : new_index_name
            }
        })['total'] > 0
        return res

    def _update_alias(self, index_name, alias):
        """Add ``alias`` to ``index_name``; True when the request was acknowledged."""
        # creates alias with old index name to new index name
        # so we can keep using the old index name
        # e.g., history -> dfr_history
        res = bool(self.client.indices.update_aliases(body = {
            'actions' : [{
                'add'  : {
                    'index' : index_name,
                    'alias' : alias
                }
            }]
        })['acknowledged'])
        return res

    def index_info(self, index_name = None):
        """Print the cat-indices listing of ``index_name`` (all indices when omitted)."""
        if index_name:
            listing = self.client.cat.indices(format = 'json', index = index_name)
        else:
            listing = self.client.cat.indices(format = 'json')
        print(json.dumps(listing, indent = 2))

    def _convert_to_date(self, field):
        """Return ``field`` parsed as 'YYYY-MM-DD HH:MM:SS', or unchanged when it is not one."""
        try:
            return dt.strptime(field, '%Y-%m-%d %H:%M:%S')
        except (TypeError, ValueError):
            # not a string, or a string in another format
            return field

    def _extract_mappings(self, sample, similarity = None):
        """Derive the index mappings from one sample document.

        Args:
            sample: a document whose value types decide the field types.
            similarity: optional name of a similarity to set on text fields.

        Returns:
            ``{'mappings': {...}}``, ready to be merged into a create-index body.
            Fields whose sample value has no known type (e.g. ``None``) are
            left out, which leaves their mapping to Elasticsearch.
        """
        types = {
            'int'      : 'integer',
            'float'    : 'float',
            'bool'     : 'boolean',
            'str'      : 'text',
            'datetime' : 'date'
        }
        properties = {}
        for property_, raw_val in sample.items():
            property_val = self._convert_to_date(raw_val)
            es_type = types.get(type(property_val).__name__)
            if es_type is None:
                continue
            field_mapping = { 'type' : es_type }
            if es_type == 'date':
                # the documents keep the original string, so tell
                # Elasticsearch how to parse it
                field_mapping['format'] = self._ES_DATE_FORMAT
            elif es_type == 'text' and similarity:
                field_mapping['similarity'] = similarity
            properties[property_] = field_mapping
        return {
            'mappings' : {
                '_source' : {
                    'enabled' : True
                },
                'properties' : properties
            }
        }

    def query(self, index, body = None):
        """Search ``index`` with ``body`` (match_all when omitted), with explanations enabled."""
        if body is None:
            # built per call instead of sharing one mutable default
            body = {"query": {"match_all": {}}}
        return self.client.search(body = body, index = index, explain = True)

    def get_hits(self, results, *fields, explain = True, fmt = 'json'):
        """Reduce a search response to the requested ``fields`` plus the score.

        Args:
            results: response of ``query``.
            *fields: names of the ``_source`` fields to keep.
            explain: include each hit's explanation when the response has one
                ('json' format only).
            fmt: 'json' for a dict, 'ascii' for a printable string.

        Raises:
            ValueError: ``fmt`` is neither 'json' nor 'ascii'.
        """
        if fmt == 'json':
            hits = {}
            for hit in results['hits']['hits']:
                entry = { field : hit['_source'][field] for field in fields }
                entry['score'] = hit['_score']
                if explain and '_explanation' in hit:
                    entry['explanation'] = hit['_explanation']
                hits[hit['_id']] = entry
            return {
                'count' : results['hits']['total']['value'],
                'hits' : hits
            }
        elif fmt == 'ascii':
            fmt_hits = []
            for hit in results['hits']['hits']:
                lines = [f"id: {hit['_id']}\n"]
                lines.extend(f"{field}: {hit['_source'][field]}\n" for field in fields)
                lines.append(f"score: {hit['_score']}\n")
                fmt_hits.append(''.join(lines))
            return '\n'.join(fmt_hits)
        else:
            raise ValueError(f'Error: unrecognised format {fmt}')
    
    

In [None]:
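# mock dataset mirroring the aggregator's output (Chrome and Firefox records);
# note: SearchEngine loads ../dataset/browsing_history.json itself and does not read this variable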
docs = json.loads("""
[
  {
    "id": 12181,
    "url": "https://www.google.com/search?q=sqlite+chrome+history&oq=sqlite+chrome+&aqs=chrome.3.69i57j0i512l3j0i22i30l6.3022j0j4&sourceid=chrome&ie=UTF-8",
    "title": "sqlite chrome history - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:14",
    "visit_date": "2023-03-19 21:50:13",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12181,
    "url": "https://www.google.com/search?q=sqlite+chrome+history&oq=sqlite+chrome+&aqs=chrome.3.69i57j0i512l3j0i22i30l6.3022j0j4&sourceid=chrome&ie=UTF-8",
    "title": "sqlite chrome history - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:14",
    "visit_date": "2023-03-19 21:50:14",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12182,
    "url": "https://en.wikiversity.org/wiki/Chromium_browsing_history_database",
    "title": "Chromium browsing history database - Wikiversity",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:26",
    "visit_date": "2023-03-19 21:50:26",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12183,
    "url": "https://www.researchgate.net/figure/Chrome-history-SQLite-The-highlighted-record-corresponds-to-a-bookmark-added-in-the_fig1_262880203",
    "title": "Chrome history SQLite. The highlighted record corresponds to a bookmark... | Download Scientific Diagram",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:27",
    "visit_date": "2023-03-19 21:50:27",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12191,
    "url": "https://github.com/tomasraposo/ir-search-engine/blob/714f37b9808718ebae220c8f64e7e83070d0117e/src/aggregator.ipynb",
    "title": "ir-search-engine/aggregator.ipynb at 714f37b9808718ebae220c8f64e7e83070d0117e \u00b7 tomasraposo/ir-search-engine",
    "visit_count": 3,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:42",
    "visit_date": "2023-03-19 21:55:12",
    "from_visit": 25,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12191,
    "url": "https://github.com/tomasraposo/ir-search-engine/blob/714f37b9808718ebae220c8f64e7e83070d0117e/src/aggregator.ipynb",
    "title": "ir-search-engine/aggregator.ipynb at 714f37b9808718ebae220c8f64e7e83070d0117e \u00b7 tomasraposo/ir-search-engine",
    "visit_count": 3,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:42",
    "visit_date": "2023-03-19 21:55:12",
    "from_visit": 27,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12192,
    "url": "https://www.google.com/search?q=firefox+host&oq=firefox+host&aqs=chrome..69i57j0i512l7j0i22i30l2.3143j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox host - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:08",
    "visit_date": "2023-03-19 21:57:07",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12192,
    "url": "https://www.google.com/search?q=firefox+host&oq=firefox+host&aqs=chrome..69i57j0i512l7j0i22i30l2.3143j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox host - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:08",
    "visit_date": "2023-03-19 21:57:08",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12194,
    "url": "https://www.google.com/search?q=firefox+sqlite+datbase+schema&oq=firefox+sqlite+datbase+schema&aqs=chrome..69i57j33i10i160j33i10i22i29i30l5j33i10i15i22i29i30.4066j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox sqlite datbase schema - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:51",
    "visit_date": "2023-03-19 21:57:50",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12194,
    "url": "https://www.google.com/search?q=firefox+sqlite+datbase+schema&oq=firefox+sqlite+datbase+schema&aqs=chrome..69i57j33i10i160j33i10i22i29i30l5j33i10i15i22i29i30.4066j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox sqlite datbase schema - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:51",
    "visit_date": "2023-03-19 21:57:51",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12195,
    "url": "https://wiki.mozilla.org/File:Places.sqlite.schema.pdf",
    "title": "File:Places.sqlite.schema.pdf - MozillaWiki",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:57:54",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12196,
    "url": "https://mozilla.github.io/firefox-browser-architecture/text/0010-firefox-data-stores.html",
    "title": "Firefox Data Stores",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:02",
    "visit_date": "2023-03-19 21:58:02",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12197,
    "url": "https://www.google.com/search?q=firefox+sqlite+history+schemas&ei=XoUXZNfqO-P0qwHF5pSQDw&ved=0ahUKEwjXkdK2_uj9AhVj-ioKHUUzBfIQ4dUDCA8&uact=5&oq=firefox+sqlite+history+schemas&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIICCEQoAEQwwQyCAghEKABEMMEMggIIRCgARDDBDoICAAQhgMQsAM6BAgAEB46BggAEAgQHjoFCAAQhgM6CgghEKABEMMEEApKBAhBGAFQxQJYwhZg0BdoAXAAeACAAb0CiAGvC5IBBzAuNi4xLjGYAQCgAQHIAQTAAQE&sclient=gws-wiz-serp",
    "title": "firefox sqlite history schemas - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:58:20",
    "from_visit": 34,
    "visit_type": "submit",
    "browser": "Chrome"
  },
  {
    "id": 12197,
    "url": "https://www.google.com/search?q=firefox+sqlite+history+schemas&ei=XoUXZNfqO-P0qwHF5pSQDw&ved=0ahUKEwjXkdK2_uj9AhVj-ioKHUUzBfIQ4dUDCA8&uact=5&oq=firefox+sqlite+history+schemas&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIICCEQoAEQwwQyCAghEKABEMMEMggIIRCgARDDBDoICAAQhgMQsAM6BAgAEB46BggAEAgQHjoFCAAQhgM6CgghEKABEMMEEApKBAhBGAFQxQJYwhZg0BdoAXAAeACAAb0CiAGvC5IBBzAuNi4xLjGYAQCgAQHIAQTAAQE&sclient=gws-wiz-serp",
    "title": "firefox sqlite history schemas - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:58:21",
    "from_visit": 37,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12195,
    "url": "https://wiki.mozilla.org/File:Places.sqlite.schema.pdf",
    "title": "File:Places.sqlite.schema.pdf - MozillaWiki",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:58:21",
    "from_visit": 38,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12198,
    "url": "https://wiki.mozilla.org/images/0/08/Places.sqlite.schema.pdf",
    "title": "Places.sqlite.schema.pdf",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:24",
    "visit_date": "2023-03-19 21:58:24",
    "from_visit": 39,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12199,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome..69i57.4730j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:04:02",
    "visit_date": "2023-03-19 22:04:01",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12199,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome..69i57.4730j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:04:02",
    "visit_date": "2023-03-19 22:04:02",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12200,
    "url": "https://raw.githubusercontent.com/mozilla/gecko-dev/master/toolkit/components/places/nsPlacesIndexes.h",
    "title": "",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:04:14",
    "visit_date": "2023-03-19 22:04:14",
    "from_visit": 42,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12201,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome.0.69i59.2125j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:05:04",
    "visit_date": "2023-03-19 22:05:04",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12201,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome.0.69i59.2125j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:05:04",
    "visit_date": "2023-03-19 22:05:04",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12191,
    "url": "https://github.com/tomasraposo/ir-search-engine/blob/714f37b9808718ebae220c8f64e7e83070d0117e/src/aggregator.ipynb",
    "title": "ir-search-engine/aggregator.ipynb at 714f37b9808718ebae220c8f64e7e83070d0117e \u00b7 tomasraposo/ir-search-engine",
    "visit_count": 3,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:42",
    "visit_date": "2023-03-19 22:19:42",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12216,
    "url": "https://github.com/tomasraposo/ir-search-engine",
    "title": "tomasraposo/ir-search-engine",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:44",
    "visit_date": "2023-03-19 22:19:44",
    "from_visit": 69,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12217,
    "url": "https://github.com/tomasraposo/ir-search-engine/tree/aggregator",
    "title": "tomasraposo/ir-search-engine at aggregator",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:20:08",
    "visit_date": "2023-03-19 22:20:08",
    "from_visit": 70,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 1,
    "url": "https://www.google.com/search?channel=fs&client=ubuntu&q=mozilla+sqlite+schemas+",
    "title": "mozilla sqlite schemas - Google Search",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:51",
    "visit_date": "2023-03-19 21:18:51",
    "from_visit": 0,
    "visit_type": "typed",
    "browser": "Firefox"
  },
  {
    "id": 2,
    "url": "https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjus4rb9ej9AhUKt4sKHQ91AZUQFnoECA0QAQ&url=https%3A%2F%2Fwiki.mozilla.org%2Fimages%2F0%2F08%2FPlaces.sqlite.schema.pdf&usg=AOvVaw1VqHh-NQHUFYqoK6-DldIH",
    "title": null,
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:55",
    "visit_date": "2023-03-19 21:18:55",
    "from_visit": 1,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 3,
    "url": "https://wiki.mozilla.org/images/0/08/Places.sqlite.schema.pdf",
    "title": "Places.sqlite.schema.pdf",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:56",
    "visit_date": "2023-03-19 21:18:56",
    "from_visit": 2,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 4,
    "url": "https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjus4rb9ej9AhUKt4sKHQ91AZUQFnoECA4QAQ&url=https%3A%2F%2Fwiki.mozilla.org%2Fimages%2F7%2F72%2FContent-prefs.sqlite.schema.pdf&usg=AOvVaw2xp8uTcWWZhEur4dMUmp4v",
    "title": null,
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:59",
    "visit_date": "2023-03-19 21:18:59",
    "from_visit": 1,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 5,
    "url": "https://wiki.mozilla.org/images/7/72/Content-prefs.sqlite.schema.pdf",
    "title": "Content-prefs.sqlite.schema.pdf",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:01",
    "visit_date": "2023-03-19 21:19:01",
    "from_visit": 4,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 6,
    "url": "https://www.google.com/search?q=sqlite+documentation&client=ubuntu&hs=qFP&channel=fs&ei=O3wXZK6qHYrurgSP6oWoCQ&ved=0ahUKEwjus4rb9ej9AhUKt4sKHQ91AZUQ4dUDCGo&uact=5&oq=sqlite+documentation&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIGCAAQFhAeMgYIABAWEB4yBggAEBYQHjIGCAAQFhAeOgoIABBHENYEELADOgQIABBDOgUIABCRAjoLCC4QgAQQxwEQ0QM6BQgAEIYDSgQIQRgAUJsJWPcXYPYaaANwAXgAgAGVAYgB-BCSAQQ1LjE1mAEAoAEByAECwAEB&sclient=gws-wiz-serp",
    "title": "sqlite documentation - Google Search",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:07",
    "visit_date": "2023-03-19 21:19:07",
    "from_visit": 1,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 7,
    "url": "file:///home/tomasraposo/.local/share/jupyter/runtime/nbserver-962520-open.html",
    "title": "Opening Jupyter Notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:11",
    "visit_date": "2023-03-19 21:19:11",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 8,
    "url": "http://localhost:8888/tree?token=1de4774f1cd881f1ed29059dd80fc03ec3f54e40761b2f0c",
    "title": "Home Page - Select or create a notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:12",
    "visit_date": "2023-03-19 21:19:12",
    "from_visit": 7,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 9,
    "url": "http://localhost:8888/tree",
    "title": "Home Page - Select or create a notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:12",
    "visit_date": "2023-03-19 21:19:12",
    "from_visit": 8,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 10,
    "url": "http://localhost:8888/notebooks/aggregator.ipynb",
    "title": "aggregator - Jupyter Notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:26",
    "visit_date": "2023-03-19 21:19:26",
    "from_visit": 9,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 11,
    "url": "http://localhost:8888/notebooks/aggregator.ipynb#",
    "title": "aggregator - Jupyter Notebook",
    "visit_count": 2,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:27:19",
    "visit_date": "2023-03-19 21:20:25",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 14,
    "url": "http://localhost:8888/notebooks/aggregator.ipynb#",
    "title": "aggregator - Jupyter Notebook",
    "visit_count": 2,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:27:19",
    "visit_date": "2023-03-19 21:27:19",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 15,
    "url": "http://localhost:8888/notebooks/Untitled1.ipynb?kernel_name=python3",
    "title": "Untitled1 - Jupyter Notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:27:20",
    "visit_date": "2023-03-19 21:27:20",
    "from_visit": 14,
    "visit_type": "link",
    "browser": "Firefox"
  }
]
""")
print(f'Total number of docs: {len(docs)}')

In [4]:
# create the search engine for `host`;
# a failure is printed rather than raised so the notebook keeps running
try:
    se = SearchEngine(host)
except Exception as err:
    print(err)

Ok: cluster is up


In [None]:
# se.cluster_info()

In [5]:
# you can manually delete a single index by name, or all of them with `*`,
# if you feel you messed up somewhere
# WARNING: `*` is a wildcard -- presumably `se.client` is the Elasticsearch client,
# in which case this removes every index on the cluster, not only `history`
se.client.indices.delete(index = '*')

{'acknowledged': True}

## Ranking models
_BM25 similarity (default)_

Note: for each ranking model used, we are required to explain how it scores documents, i.e. how the model is reflected in the documents returned and their order.


### BM25 (Best Match Okapi)
This is the default ranking model used by Elasticsearch

In [6]:
# BM25 similarity settings, registered under the name `sim_bm25`
sim_bm25 = dict(
    sim_bm25 = dict(type = 'BM25', b = '0.75', k1 = 1.2)
)

In [7]:
# create index
index_name = 'history'
_doc = 'browser_history'
try:
    se.create_index(index_name, _doc, sim_bm25)
except Exception as e:
    # report the failure instead of silently discarding it, so a missing
    # index is not mistaken for a successful run
    print(str(e))

Ok: success: 77; fail: []
OK: index history created.


In [8]:
# list index metadata (health, status, doc count, store size) to confirm the index exists
se.index_info()

[
  {
    "health": "yellow",
    "status": "open",
    "index": "history",
    "uuid": "7RZX439yQOKpw81_LHd5FQ",
    "pri": "1",
    "rep": "1",
    "docs.count": "77",
    "docs.deleted": "0",
    "store.size": "55.4kb",
    "pri.store.size": "55.4kb"
  }
]


In [9]:
# get all records (no query body is passed to the helper)
q = se.query(index_name)
# pretty-print the raw response
print(json.dumps(q, indent = 2))

{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 77,
      "relation": "eq"
    },
    "max_score": 1.0,
    "hits": [
      {
        "_shard": "[history][0]",
        "_node": "Hi5B2wvrRh-CzeJ_Uc0yMw",
        "_index": "history",
        "_type": "_doc",
        "_id": "0",
        "_score": 1.0,
        "_source": {
          "id": 12179,
          "url": "https://www.google.com/search?q=go&oq=go&aqs=chrome..69i57j46i131i199i433i465i512j0i433i512j0i131i433i512j0i433i512j69i65l3.440j0j4&sourceid=chrome&ie=UTF-8",
          "title": "go - Google Search",
          "visit_count": 2,
          "typed_count": 0,
          "last_visit_date": "2023-04-09 16:24:16",
          "visit_date": "2023-04-09 16:24:15",
          "from_visit": 0,
          "visit_type": "generated",
          "browser": "Chrome"
        },
        "_explanation": {
          "value": 1.0,
      

In [None]:
# look up documents whose `title` holds the term `sqlite`
query = {'query': {'term': {'title': 'sqlite'}}}

res = se.query(index_name, body = query)
# keep only the listed fields of each hit, formatted with `fmt = 'ascii'`
hits = se.get_hits(
    res,
    'url', 'title', 'visit_date', 'last_visit_date',
    explain = False,
    fmt = 'ascii'
)
# if you want to print as json use `json.dumps(hits, indent = 2)`
print(hits)

In [None]:
# a `bool` query combines clauses into a boolean expression:
# `should` behaves like OR, `must` behaves like AND,
# and the clauses may target different fields

# find documents whose title holds any one of the keywords
keywords = 'sqlite documentation history'
query = {
    'query': {
        'bool': {
            'should': [{'terms': {'title': keywords.split(' ')}}]
        }
    }
}
res = se.query(index_name, body = query)
hits = se.get_hits(
    res,
    'url', 'title', 'visit_date', 'last_visit_date',
    explain = False
)
print(json.dumps(hits, indent = 2))

In [None]:
# search for documents whose `last_visit_date` is on or after March 29, 2023 and that
# contain either `history` in the `title` or `google` in the `url`
# NOTE(review): a `match` query analyzes its input, it does not evaluate `/google/`
# as a regex; use a `regexp` or `wildcard` clause if pattern matching is intended
query = {
    'query' : {
        'bool' : {
            'must' : [
                {
                    'range' : {
                        'last_visit_date' : {
                            # year-month-day order, written in the same
                            # `yyyy-MM-dd HH:mm:ss` layout as the stored values
                            'gte' : '2023-03-29 00:00:00'
                        }
                    }
                }
            ],
            # each alternative needs its own dict: two `match` keys in a single
            # dict literal collapse into the last one
            'should' : [
                {
                    'match' : {
                        'title' : 'history'
                    }
                },
                {
                    'match' : {
                        'url' :  '/google/'
                    }
                }
            ],
            # next to a `must` clause the `should` clauses are optional by default;
            # require at least one of them so the "either ... or" condition holds
            'minimum_should_match' : 1
        }
    }
}
res = se.query(index_name, body = query)
hits = se.get_hits(res, *['url', 'title', 'visit_date', 'last_visit_date'], explain = False)
print(json.dumps(hits, indent = 2))

In [None]:
# documents last visited on or before 2023-03-18 00:00:00
query = {
    'query': {
        'range': {
            'last_visit_date' : {
                # uncomment to add a lower bound
#                 "gte": "2023-01-01 00:00:00",
                "lte": "2023-03-18 00:00:00",
                # the pattern has to mirror the bounds literally, colons included,
                # otherwise the bounds cannot be parsed
                "format": "yyyy-MM-dd HH:mm:ss"
            }
        }
    }
}
res = se.query(index_name, body = query)
hits = se.get_hits(res, *['url', 'title', 'visit_date', 'last_visit_date'], explain = False)
print(json.dumps(hits, indent = 2))

### DFR (Divergence from Randomness)
This model takes into account statistical properties of the collection, e.g. frequency and distribution of terms within the collection, length of documents, etc.  

In [None]:
# DFR similarity settings, registered under the name `sim_dfr`
sim_dfr = {
    'sim_dfr' : {
        'type' : 'DFR',
        'basic_model' : 'g',
        'after_effect' : 'l',
        'normalization' : 'h2',
        'normalization.h2.c' : '2.0'
    }
}
# switch the index over to the DFR settings
se.update_ranking_model(index_name, sim_dfr)

In [None]:
# list index metadata again to check the index after the similarity update
se.index_info()

### BM25F
BM25F extends BM25 to documents made up of several weighted fields. Here it is approximated with a `multi_match` query, which allows us to assign a different weight to each field.

In [None]:
# BM25 settings, registered under their own name for the BM25F experiment
sim_bm25f = dict(
    sim_bm25f = dict(type = 'BM25', b = '0.75', k1 = 1.2)
)
# the update has to target the actual index name, not the alias
# (a known issue in Elasticsearch);
# `se.index_info()` in the cell above shows the index name
se.update_ranking_model(index_name, sim_bm25f)

In [None]:
# list index metadata to check the index after registering `sim_bm25f`
se.index_info()

In [None]:
# build `field^weight` entries; the weight is the 1-based position in the tuple
fields = [
    f'{name}^{boost}'
    for boost, name in enumerate(
        ('title', 'url', 'browser', 'last_visit_date'),
        start = 1
    )
]
fields

In [None]:
# `combined_fields` (introduced in v7.13) is the query that truly implements BM25F;
# as we are running an older version (as provided by the lecturer),
# we have to either use `multi_match` or bump the version (not sure we can)
# see: https://opensourceconnections.com/blog/2021/06/30/better-term-centric-scoring-in-elasticsearch-with-bm25f-and-the-combined_fields-query/

query = {
    'query': {
        'multi_match': {'query': 'sqlite', 'fields': fields, 'type': 'cross_fields'}
    }
}

q1 = se.query(index_name, body = query)
hits = se.get_hits(
    q1,
    'url', 'title', 'visit_date', 'last_visit_date',
    explain = False
)
print(json.dumps(hits, indent = 2))

### Evaluation

In [None]:
# Evaluation: how well do the retrieved documents match a given query?
#
# The test suite holds 3-tuples of (query id, document id, score).
# From them we derive recall and precision per query, which give the average
# effectiveness of the search engine, and then the F-score.
#
# We also compute the fall-out, i.e. the probability of a non-relevant
# document being retrieved.
#
# The relevance judgments are kept as a single suite covering all queries under test.
#
# BM25 or BM25F is going to be the ranking model we use.

# Query 1:
# search for `sqlite` in title
query1 = {'query': {'term': {'title': 'sqlite'}}}

res = se.query(index_name, body = query1)
hits = se.get_hits(
    res,
    'url', 'title', 'visit_date', 'last_visit_date',
    explain = False,
    fmt = 'ascii'
)
print(hits)

# prepare relevance judgments
# NOTE(review): only 16 scores are listed, so `zip` stops at document id 15
# even though the range runs to 37 -- confirm whether more judgments are missing
q1_rj = [
    (0, *judgment)
    for judgment in zip(
        range(37),
        [2,3,0,0,0,0,0,0,2,3,1,0,0,0,0,0]
    )
]

# calculate recall, precision, F-score, fall out
# ...

### User Interface

In [8]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display, clear_output

In [9]:
def _single_clause(clause):
    """Return a builder that wraps `{field: query}` in a top-level `clause` query."""
    def build(field, query):
        return {'query': {clause: {field: query}}}
    return build


def _range_query(field, query):
    """Build a `range` query that uses `query` as the inclusive upper bound."""
    return {'query': {'range': {field: {'lte': query}}}}


def _multi_match_query(field, query):
    """Build a `cross_fields` `multi_match` query; `field` is passed as `fields`."""
    return {'query': {'multi_match': {'query': query, 'type': 'cross_fields', 'fields': field}}}


# maps a query type name to a `(field, query)` -> request body builder
query_types = {
    'match' : _single_clause('match'),
    'term' : _single_clause('term'),
    'range' : _range_query,
    'multi_match' : _multi_match_query
}

In [10]:
class SearchInterface:
    """Small widget form that runs a query against `index_name` as the user types.

    The form has a query type dropdown, a field box, a weights box and a
    search box; results are printed into an output area below them.
    For `multi_match` the field and weights boxes take whitespace-separated
    lists that are paired up position by position.
    """

    def __init__(self):
        self.search_area = widgets.Text(
            value = '',
            placeholder = 'Enter search...',
            description = 'Search:',
            layout = widgets.Layout(
                width ='50%', 
                height = '30px'
            )
        )

        self.query_type_dd = widgets.Dropdown(
            options = ['match', 'term', 'range', 'multi_match'],
            value = 'match',
            description = 'Query type:'
        )

        # free-text field name, e.g. `url`, `title` or `last_visit_date`
        # (a Text widget has no `options` trait, so none is passed)
        self.field_input = widgets.Text(
            value = '',
            placeholder = 'Enter field...',
            description = 'Field:',
            layout = widgets.Layout(
                width = '50%',
                height = '30px'
            ),
        )

        self.weight_input = widgets.Text(
            value = '',
            placeholder = 'Enter weights...',
            description = 'Weights:',
            layout = widgets.Layout(
                width = '50%',
                height = '30px'
            )
        )

        self.output = widgets.Output()

    @staticmethod
    def _weighted_fields(field, weight):
        """Pair whitespace-separated field names with their weights.

        Returns a list of `name^weight` strings; a field without a matching
        weight is kept unweighted instead of being dropped.
        """
        boosts = weight.split()
        return [
            f'{name}^{boosts[pos]}' if pos < len(boosts) else name
            for pos, name in enumerate(field.split())
        ]

    def search_on_change(self, change):
        """Run the query described by the current widget values and print the hits.

        `change` is the notification passed by the widget observer; it is not used.
        """
        query_type = self.query_type_dd.value
        field = self.field_input.value.strip()
        weight = self.weight_input.value.strip()
        query = self.search_area.value.strip()

        query_f = query_types[query_type]
        with self.output:
            clear_output()
            if not query:
                return
            if not field:
                # avoid sending a request with an empty field name
                print('Enter a field to search on.')
                return
            target = field
            if query_type == 'multi_match':
                target = self._weighted_fields(field, weight)
            res = se.query(index_name, body = query_f(target, query))
            hits = se.get_hits(res, *['url', 'title', 'visit_date', 'last_visit_date'], explain = False, fmt = 'ascii')
            print(hits)

    def display(self):
        """Hook the search box up to `search_on_change` and render the form."""
        self.search_area.observe(self.search_on_change, names='value')
        # render widgets
        display(self.query_type_dd, self.field_input, self.weight_input, self.search_area, self.output)

            
# build the search form and render it in the cell output
SearchInterface().display()

Dropdown(description='Query type:', options=('match', 'term', 'range', 'multi_match'), value='match')

Text(value='', description='Field:', layout=Layout(height='30px', width='50%'), placeholder='Enter field...')

Text(value='', description='Weights:', placeholder='Enter weights...')

Text(value='', description='Search:', layout=Layout(height='30px', width='50%'), placeholder='Enter search...'…

Output()