In [1]:
import elasticsearch as es
from datetime import datetime as dt
import json

In [2]:
class SearchEngine:
    """Thin context-managed wrapper around an Elasticsearch client.

    Entering the context pings the cluster and fails fast when it is not
    reachable; leaving the context closes the client when it supports it.
    """

    def __init__(self, host):
        """Create the underlying client.

        Args:
            host: value passed straight to ``es.Elasticsearch`` (the notebook
                passes a URL string).
        """
        self.client = es.Elasticsearch(host)

    def __enter__(self):
        """Ping the cluster and return this instance.

        Raises:
            Exception: if the ping result is falsy.
        """
        ping = self.client.ping()
        if not ping:
            raise Exception('Error: could not connect to cluster')
        print('Ok: cluster is up')
        return self
    
    def cluster_info(self):
        """Print the result of the client's ``info()`` call as indented JSON."""
        info = self.client.info()
        # NOTE(review): newer elasticsearch-py clients appear to wrap responses
        # in an object that json.dumps cannot serialise and that exposes the
        # payload as `.body`; plain dict responses have no such attribute and
        # are used as-is. Confirm against the installed client version.
        payload = getattr(info, 'body', info)
        print(json.dumps(payload, indent = 2))
    
    def build_index(self):
        """Placeholder: index creation is not implemented yet."""
        pass
    
    def _convert_to_date(self, field):
        """Parse ``field`` as a ``'%Y-%m-%d %H:%M:%S'`` timestamp when possible.

        Args:
            field: any value taken from a document.

        Returns:
            A ``datetime`` when ``field`` is a string in the expected format,
            otherwise ``field`` itself, unchanged.
        """
        try:
            return dt.strptime(field, '%Y-%m-%d %H:%M:%S')
        except (TypeError, ValueError):
            # TypeError: field is not a string; ValueError: wrong format.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            return field
            
    def extract_mapping(self, sample):
        """Derive a field-type mapping from a single sample document.

        Timestamp-looking strings are converted first so they are mapped as
        dates rather than text. Fields whose sample value is ``None`` are left
        out because no type can be inferred from them.

        Args:
            sample: dict of field name -> value; it is not modified.

        Returns:
            ``{'mapping': {'properties': {field: {'type': ...}}}}``.

        Raises:
            KeyError: if a non-null value has a type with no known mapping.
        """
        sample_ = {
            field: self._convert_to_date(val)
            for field, val in sample.items()
        }
        # Debug output of the sanitised sample, kept from the original code.
        print(sample_)
        # Keyed by the exact type name, so bool is never mistaken for int.
        types = {
            'int'      : 'integer',
            'float'    : 'float',
            'bool'     : 'boolean',
            'str'      : 'text',
            'datetime' : 'date'
        }
        properties = {}
        for property_, property_val in sample_.items():
            if property_val is None:
                continue
            type_name = type(property_val).__name__
            if type_name not in types:
                raise KeyError(
                    f'unsupported value type {type_name!r} for field {property_!r}'
                )
            properties[property_] = {'type': types[type_name]}
        # NOTE(review): index-creation bodies usually use the key 'mappings';
        # 'mapping' is kept here because callers may already rely on it.
        return {
            'mapping' : {
                'properties' : properties
            }
        }
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the client if it exposes ``close()``; never suppress errors."""
        close = getattr(self.client, 'close', None)
        if callable(close):
            close()
        return False
    

In [3]:
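# Mock dataset standing in for the aggregator's output (Chrome and Firefox history records).
# Note: the Firefox records carry `typed_count` as a string, and some have a null `title`.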
docs = json.loads("""
[
  {
    "id": 12181,
    "url": "https://www.google.com/search?q=sqlite+chrome+history&oq=sqlite+chrome+&aqs=chrome.3.69i57j0i512l3j0i22i30l6.3022j0j4&sourceid=chrome&ie=UTF-8",
    "title": "sqlite chrome history - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:14",
    "visit_date": "2023-03-19 21:50:13",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12181,
    "url": "https://www.google.com/search?q=sqlite+chrome+history&oq=sqlite+chrome+&aqs=chrome.3.69i57j0i512l3j0i22i30l6.3022j0j4&sourceid=chrome&ie=UTF-8",
    "title": "sqlite chrome history - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:14",
    "visit_date": "2023-03-19 21:50:14",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12182,
    "url": "https://en.wikiversity.org/wiki/Chromium_browsing_history_database",
    "title": "Chromium browsing history database - Wikiversity",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:26",
    "visit_date": "2023-03-19 21:50:26",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12183,
    "url": "https://www.researchgate.net/figure/Chrome-history-SQLite-The-highlighted-record-corresponds-to-a-bookmark-added-in-the_fig1_262880203",
    "title": "Chrome history SQLite. The highlighted record corresponds to a bookmark... | Download Scientific Diagram",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:50:27",
    "visit_date": "2023-03-19 21:50:27",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12191,
    "url": "https://github.com/tomasraposo/ir-search-engine/blob/714f37b9808718ebae220c8f64e7e83070d0117e/src/aggregator.ipynb",
    "title": "ir-search-engine/aggregator.ipynb at 714f37b9808718ebae220c8f64e7e83070d0117e \u00b7 tomasraposo/ir-search-engine",
    "visit_count": 3,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:42",
    "visit_date": "2023-03-19 21:55:12",
    "from_visit": 25,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12191,
    "url": "https://github.com/tomasraposo/ir-search-engine/blob/714f37b9808718ebae220c8f64e7e83070d0117e/src/aggregator.ipynb",
    "title": "ir-search-engine/aggregator.ipynb at 714f37b9808718ebae220c8f64e7e83070d0117e \u00b7 tomasraposo/ir-search-engine",
    "visit_count": 3,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:42",
    "visit_date": "2023-03-19 21:55:12",
    "from_visit": 27,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12192,
    "url": "https://www.google.com/search?q=firefox+host&oq=firefox+host&aqs=chrome..69i57j0i512l7j0i22i30l2.3143j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox host - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:08",
    "visit_date": "2023-03-19 21:57:07",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12192,
    "url": "https://www.google.com/search?q=firefox+host&oq=firefox+host&aqs=chrome..69i57j0i512l7j0i22i30l2.3143j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox host - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:08",
    "visit_date": "2023-03-19 21:57:08",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12194,
    "url": "https://www.google.com/search?q=firefox+sqlite+datbase+schema&oq=firefox+sqlite+datbase+schema&aqs=chrome..69i57j33i10i160j33i10i22i29i30l5j33i10i15i22i29i30.4066j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox sqlite datbase schema - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:51",
    "visit_date": "2023-03-19 21:57:50",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12194,
    "url": "https://www.google.com/search?q=firefox+sqlite+datbase+schema&oq=firefox+sqlite+datbase+schema&aqs=chrome..69i57j33i10i160j33i10i22i29i30l5j33i10i15i22i29i30.4066j0j7&sourceid=chrome&ie=UTF-8",
    "title": "firefox sqlite datbase schema - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:57:51",
    "visit_date": "2023-03-19 21:57:51",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12195,
    "url": "https://wiki.mozilla.org/File:Places.sqlite.schema.pdf",
    "title": "File:Places.sqlite.schema.pdf - MozillaWiki",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:57:54",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12196,
    "url": "https://mozilla.github.io/firefox-browser-architecture/text/0010-firefox-data-stores.html",
    "title": "Firefox Data Stores",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:02",
    "visit_date": "2023-03-19 21:58:02",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12197,
    "url": "https://www.google.com/search?q=firefox+sqlite+history+schemas&ei=XoUXZNfqO-P0qwHF5pSQDw&ved=0ahUKEwjXkdK2_uj9AhVj-ioKHUUzBfIQ4dUDCA8&uact=5&oq=firefox+sqlite+history+schemas&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIICCEQoAEQwwQyCAghEKABEMMEMggIIRCgARDDBDoICAAQhgMQsAM6BAgAEB46BggAEAgQHjoFCAAQhgM6CgghEKABEMMEEApKBAhBGAFQxQJYwhZg0BdoAXAAeACAAb0CiAGvC5IBBzAuNi4xLjGYAQCgAQHIAQTAAQE&sclient=gws-wiz-serp",
    "title": "firefox sqlite history schemas - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:58:20",
    "from_visit": 34,
    "visit_type": "submit",
    "browser": "Chrome"
  },
  {
    "id": 12197,
    "url": "https://www.google.com/search?q=firefox+sqlite+history+schemas&ei=XoUXZNfqO-P0qwHF5pSQDw&ved=0ahUKEwjXkdK2_uj9AhVj-ioKHUUzBfIQ4dUDCA8&uact=5&oq=firefox+sqlite+history+schemas&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIICCEQoAEQwwQyCAghEKABEMMEMggIIRCgARDDBDoICAAQhgMQsAM6BAgAEB46BggAEAgQHjoFCAAQhgM6CgghEKABEMMEEApKBAhBGAFQxQJYwhZg0BdoAXAAeACAAb0CiAGvC5IBBzAuNi4xLjGYAQCgAQHIAQTAAQE&sclient=gws-wiz-serp",
    "title": "firefox sqlite history schemas - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:58:21",
    "from_visit": 37,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12195,
    "url": "https://wiki.mozilla.org/File:Places.sqlite.schema.pdf",
    "title": "File:Places.sqlite.schema.pdf - MozillaWiki",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:21",
    "visit_date": "2023-03-19 21:58:21",
    "from_visit": 38,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12198,
    "url": "https://wiki.mozilla.org/images/0/08/Places.sqlite.schema.pdf",
    "title": "Places.sqlite.schema.pdf",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 21:58:24",
    "visit_date": "2023-03-19 21:58:24",
    "from_visit": 39,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12199,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome..69i57.4730j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:04:02",
    "visit_date": "2023-03-19 22:04:01",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12199,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome..69i57.4730j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:04:02",
    "visit_date": "2023-03-19 22:04:02",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12200,
    "url": "https://raw.githubusercontent.com/mozilla/gecko-dev/master/toolkit/components/places/nsPlacesIndexes.h",
    "title": "",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:04:14",
    "visit_date": "2023-03-19 22:04:14",
    "from_visit": 42,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12201,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome.0.69i59.2125j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:05:04",
    "visit_date": "2023-03-19 22:05:04",
    "from_visit": 0,
    "visit_type": "generated",
    "browser": "Chrome"
  },
  {
    "id": 12201,
    "url": "https://www.google.com/search?q=moz_places_metadata&oq=moz_places_metadata&aqs=chrome.0.69i59.2125j0j7&sourceid=chrome&ie=UTF-8",
    "title": "moz_places_metadata - Google Search",
    "visit_count": 2,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:05:04",
    "visit_date": "2023-03-19 22:05:04",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12191,
    "url": "https://github.com/tomasraposo/ir-search-engine/blob/714f37b9808718ebae220c8f64e7e83070d0117e/src/aggregator.ipynb",
    "title": "ir-search-engine/aggregator.ipynb at 714f37b9808718ebae220c8f64e7e83070d0117e \u00b7 tomasraposo/ir-search-engine",
    "visit_count": 3,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:42",
    "visit_date": "2023-03-19 22:19:42",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12216,
    "url": "https://github.com/tomasraposo/ir-search-engine",
    "title": "tomasraposo/ir-search-engine",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:19:44",
    "visit_date": "2023-03-19 22:19:44",
    "from_visit": 69,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 12217,
    "url": "https://github.com/tomasraposo/ir-search-engine/tree/aggregator",
    "title": "tomasraposo/ir-search-engine at aggregator",
    "visit_count": 1,
    "typed_count": 0,
    "last_visit_date": "2023-03-19 22:20:08",
    "visit_date": "2023-03-19 22:20:08",
    "from_visit": 70,
    "visit_type": "link",
    "browser": "Chrome"
  },
  {
    "id": 1,
    "url": "https://www.google.com/search?channel=fs&client=ubuntu&q=mozilla+sqlite+schemas+",
    "title": "mozilla sqlite schemas - Google Search",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:51",
    "visit_date": "2023-03-19 21:18:51",
    "from_visit": 0,
    "visit_type": "typed",
    "browser": "Firefox"
  },
  {
    "id": 2,
    "url": "https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjus4rb9ej9AhUKt4sKHQ91AZUQFnoECA0QAQ&url=https%3A%2F%2Fwiki.mozilla.org%2Fimages%2F0%2F08%2FPlaces.sqlite.schema.pdf&usg=AOvVaw1VqHh-NQHUFYqoK6-DldIH",
    "title": null,
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:55",
    "visit_date": "2023-03-19 21:18:55",
    "from_visit": 1,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 3,
    "url": "https://wiki.mozilla.org/images/0/08/Places.sqlite.schema.pdf",
    "title": "Places.sqlite.schema.pdf",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:56",
    "visit_date": "2023-03-19 21:18:56",
    "from_visit": 2,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 4,
    "url": "https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjus4rb9ej9AhUKt4sKHQ91AZUQFnoECA4QAQ&url=https%3A%2F%2Fwiki.mozilla.org%2Fimages%2F7%2F72%2FContent-prefs.sqlite.schema.pdf&usg=AOvVaw2xp8uTcWWZhEur4dMUmp4v",
    "title": null,
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:18:59",
    "visit_date": "2023-03-19 21:18:59",
    "from_visit": 1,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 5,
    "url": "https://wiki.mozilla.org/images/7/72/Content-prefs.sqlite.schema.pdf",
    "title": "Content-prefs.sqlite.schema.pdf",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:01",
    "visit_date": "2023-03-19 21:19:01",
    "from_visit": 4,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 6,
    "url": "https://www.google.com/search?q=sqlite+documentation&client=ubuntu&hs=qFP&channel=fs&ei=O3wXZK6qHYrurgSP6oWoCQ&ved=0ahUKEwjus4rb9ej9AhUKt4sKHQ91AZUQ4dUDCGo&uact=5&oq=sqlite+documentation&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIGCAAQFhAeMgYIABAWEB4yBggAEBYQHjIGCAAQFhAeOgoIABBHENYEELADOgQIABBDOgUIABCRAjoLCC4QgAQQxwEQ0QM6BQgAEIYDSgQIQRgAUJsJWPcXYPYaaANwAXgAgAGVAYgB-BCSAQQ1LjE1mAEAoAEByAECwAEB&sclient=gws-wiz-serp",
    "title": "sqlite documentation - Google Search",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:07",
    "visit_date": "2023-03-19 21:19:07",
    "from_visit": 1,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 7,
    "url": "file:///home/tomasraposo/.local/share/jupyter/runtime/nbserver-962520-open.html",
    "title": "Opening Jupyter Notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:11",
    "visit_date": "2023-03-19 21:19:11",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 8,
    "url": "http://localhost:8888/tree?token=1de4774f1cd881f1ed29059dd80fc03ec3f54e40761b2f0c",
    "title": "Home Page - Select or create a notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:12",
    "visit_date": "2023-03-19 21:19:12",
    "from_visit": 7,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 9,
    "url": "http://localhost:8888/tree",
    "title": "Home Page - Select or create a notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:12",
    "visit_date": "2023-03-19 21:19:12",
    "from_visit": 8,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 10,
    "url": "http://localhost:8888/notebooks/aggregator.ipynb",
    "title": "aggregator - Jupyter Notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:19:26",
    "visit_date": "2023-03-19 21:19:26",
    "from_visit": 9,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 11,
    "url": "http://localhost:8888/notebooks/aggregator.ipynb#",
    "title": "aggregator - Jupyter Notebook",
    "visit_count": 2,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:27:19",
    "visit_date": "2023-03-19 21:20:25",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 14,
    "url": "http://localhost:8888/notebooks/aggregator.ipynb#",
    "title": "aggregator - Jupyter Notebook",
    "visit_count": 2,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:27:19",
    "visit_date": "2023-03-19 21:27:19",
    "from_visit": 0,
    "visit_type": "link",
    "browser": "Firefox"
  },
  {
    "id": 15,
    "url": "http://localhost:8888/notebooks/Untitled1.ipynb?kernel_name=python3",
    "title": "Untitled1 - Jupyter Notebook",
    "visit_count": 1,
    "typed_count": "0",
    "last_visit_date": "2023-03-19 21:27:20",
    "visit_date": "2023-03-19 21:27:20",
    "from_visit": 14,
    "visit_type": "link",
    "browser": "Firefox"
  }
]
""")

In [4]:
# Elasticsearch endpoint on the loopback interface.
port = 9200
host = 'http://127.0.0.1:' + str(port)

try:
    with SearchEngine(host) as se:
        # se.cluster_info()  # uncomment to print the cluster info
        # The mapping is inferred from the first mock record only.
        sample_doc = docs[0]
        mapping = se.extract_mapping(sample_doc)
        print(json.dumps(mapping, indent=2))
except Exception as e:
    # Show only the error message rather than a full traceback.
    print(e)

Ok: cluster is up
{'id': 12181, 'url': 'https://www.google.com/search?q=sqlite+chrome+history&oq=sqlite+chrome+&aqs=chrome.3.69i57j0i512l3j0i22i30l6.3022j0j4&sourceid=chrome&ie=UTF-8', 'title': 'sqlite chrome history - Google Search', 'visit_count': 2, 'typed_count': 0, 'last_visit_date': datetime.datetime(2023, 3, 19, 21, 50, 14), 'visit_date': datetime.datetime(2023, 3, 19, 21, 50, 13), 'from_visit': 0, 'visit_type': 'generated', 'browser': 'Chrome'}
{
  "mapping": {
    "properties": {
      "id": {
        "type": "integer"
      },
      "url": {
        "type": "text"
      },
      "title": {
        "type": "text"
      },
      "visit_count": {
        "type": "integer"
      },
      "typed_count": {
        "type": "integer"
      },
      "last_visit_date": {
        "type": "date"
      },
      "visit_date": {
        "type": "date"
      },
      "from_visit": {
        "type": "integer"
      },
      "visit_type": {
        "type": "text"
      },
      "browser": {
  