In [23]:
## INITIALIZATION
# Install the ElasticSearch pip package into the environment of the current Jupyter kernel.
# `sys.executable` is the interpreter running this kernel, so pip installs the package
# where the `import elasticsearch` below can find it. The leading `!` is an IPython shell
# escape: this line only works inside a notebook / IPython session, not in plain Python.
# NOTE(review): the package version is not pinned. The recorded responses in this notebook
# (`_type` field, integer `hits.total`) look like an Elasticsearch 6.x server - confirm and
# consider pinning a matching client major version.
import sys
!{sys.executable} -m pip install elasticsearch

# Imports
import elasticsearch

# Settings - *** YOU MUST EDIT THEM! ***
#   es_host / es_port: address of the Elasticsearch node the client connects to.
#   es_index: name of the index searched by every query in this notebook.
settings = {
    'es_host': 'localhost',
    'es_port': '9200',
    'es_index': 'nordic-design',
}

# Init Elastic Search handler
# `es` is a notebook-wide client: all example cells and helper functions below use it.
es = elasticsearch.Elasticsearch([{'host': settings['es_host'], 'port': settings['es_port']}])



twisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [36]:
# EXAMPLE QUERY 1: All web entities
# Selects the documents whose `type` field matches "webentity".
# The raw response dict returned by `es.search` is displayed as the cell output.
es.search(
    index=settings['es_index'],
    body={
        "query": {
            "match": { "type": "webentity" }
        },
        "size": 3 # Return at most 3 hits to keep the displayed output short
    }
)

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 254,
  'max_score': 3.5556765,
  'hits': [{'_index': 'nordic-design',
    '_type': 'doc',
    '_id': '124',
    '_score': 3.5556765,
    '_source': {'status': 'IN',
     'type': 'webentity',
     'name': 'Hankjobenhavn.com',
     'crawling_status': 'FINISHED',
     'indegree': 4,
     'tags': {'CORE-STARTPAGES': {'user': ['http://hankjobenhavn.com']},
      'USER': {'Country': ['Denmark'], 'Branch': ['Company']},
      'CORE': {'createdBy': ['user via lru']}},
     'startpages': ['http://hankjobenhavn.com'],
     'creation_date': 1539159539130,
     'prefixes': ['s:http|h:com|h:hankjobenhavn|h:www|',
      's:http|h:com|h:hankjobenhavn|',
      's:https|h:com|h:hankjobenhavn|',
      's:https|h:com|h:hankjobenhavn|h:www|'],
     'indexing_status': 'FINISHED',
     'crawled': True,
     'last_modification_date': 1539423166133,
     'homepage': 'http://www.hankjobenha

In [38]:
# EXAMPLE QUERY 2: All pages containing a given expression
# Both clauses under `bool.must` have to match: the document is a page AND its `text`
# field matches `search_query`.
search_query = "funny"
es.search(
    index=settings['es_index'],
    body={
        "query": {
            "bool": {
              "must": [
                { "match": { "type": "page" } },       # keep page documents only
                { "match": { "text": search_query } }  # full-text match on the page text
              ]
            }
        },
        "size": 3 # Return at most 3 hits to keep the displayed output short
    }
)

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 5,
  'max_score': 7.265485,
  'hits': [{'_index': 'nordic-design',
    '_type': 'doc',
    '_id': 's:https|h:se|h:nola|p:products|p:organic|p:|',
    '_score': 7.265485,
    '_source': {'status': 200,
     'webentity_status': 'IN',
     'encoding': 'utf-8',
     'url': 'https://nola.se/products/organic/',
     'timestamp': 1539425542809,
     'forgotten': False,
     'text': 'Som ett komplement till vårt eget sortiment marknadsför vi produkter från italienska\xa0Bellitalia, som liksom Nola arbetar för god\xa0design i det offentliga uterummet. Bellitalia är verksamma inom street och urban landscaping och utmärker sig med organiska former och spännande material såsom\xa0betong, marmor och polyuretan. Produkterna signalerar kvalitet, funktionalitet och\xa0innovativ\xa0design. Möbler utformade för att passa in i\xa0den levande staden. Serien Organic, design\xa0Matouš Ho

In [40]:
# EXAMPLE QUERY 3: List web entities containing a given expression
# Same page filter as the previous example, but instead of returning the pages, the
# matching pages are grouped by their `webentity` field with a terms aggregation.
# Each bucket in the response is one web entity (`key`) with its number of matching
# pages (`doc_count`).
# NOTE(review): the terms aggregation is given no `size`, so Elasticsearch returns only
# its default number of top buckets (10 according to the Elasticsearch documentation);
# web entities beyond that are only reflected in `sum_other_doc_count`.
search_query = "funny"
es.search(
    index=settings['es_index'],
    body={
        "query": {
            "bool": {
              "must": [
                { "match": { "type": "page" } },
                { "match": { "text": search_query } }
              ]
            }
        },
        "size": 0,  # no individual hits wanted, only the aggregation result
        "aggs": {
            "group_by_webentity": {
                "terms": {
                    "field": "webentity"
                }
            }
        }
    }
)

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 5, 'max_score': 0.0, 'hits': []},
 'aggregations': {'group_by_webentity': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 0,
   'buckets': [{'key': 14, 'doc_count': 1},
    {'key': 99, 'doc_count': 1},
    {'key': 129, 'doc_count': 1},
    {'key': 158, 'doc_count': 1},
    {'key': 205, 'doc_count': 1}]}}}

In [64]:
# FUNCTIONS

# Count how many web entities contain a list of different expressions.
# Note: only counts when those expressions appear in the same page(s).
def count_webentities_containing(expressions, max_webentities=10000):
    """Count the web entities that have at least one page matching every expression.

    All expressions must match the `text` field of the *same* page for that page
    (and therefore its web entity) to be counted.

    Parameters:
        expressions: list of strings to look for. A single string is accepted too
            and treated as a one-element list.
        max_webentities: maximum number of distinct web entities that can be
            counted. It is passed as the `size` of the terms aggregation; without
            it Elasticsearch only returns its default top 10 buckets, which
            silently capped the count at 10.

    Returns:
        int: number of distinct `webentity` values among the matching pages,
        at most `max_webentities`. A warning is emitted when the result was
        truncated by that limit.

    Uses the notebook-level `es` client and `settings['es_index']`.
    """
    import warnings

    # A bare string would otherwise be iterated character by character,
    # producing one "match" clause per letter.
    if isinstance(expressions, str):
        expressions = [expressions]

    page_filter = [{"match": {"type": "page"}}]
    expression_filters = [{"match": {"text": e}} for e in expressions]

    result = es.search(
        index=settings['es_index'],
        body={
            "query": {
                "bool": {
                    "must": page_filter + expression_filters
                }
            },
            "size": 0,  # only the aggregation is needed, not the page hits
            "aggs": {
                "group_by_webentity": {
                    "terms": {
                        "field": "webentity",
                        "size": max_webentities
                    }
                }
            }
        }
    )

    aggregation = result['aggregations']['group_by_webentity']

    # `sum_other_doc_count` counts the documents that fell into buckets which were
    # not returned; anything above zero means the bucket list is incomplete.
    if aggregation.get('sum_other_doc_count', 0) > 0:
        warnings.warn(
            "More than {} web entities matched; the returned count is a lower bound. "
            "Increase max_webentities.".format(max_webentities),
            stacklevel=2,
        )

    return len(aggregation['buckets'])

# Usage:
# count_webentities_containing(['funny'])

# Count how many web entities contain at least one expression from each list of alternatives.
# Note: only counts when the matching expressions appear in the same page(s).
def count_webentities_containing_alt(alt_expressions, max_webentities=10000):
    """Count the web entities with a page matching one alternative from every group.

    `alt_expressions` is a list of groups; each group is a list of interchangeable
    expressions. A page is kept when, for every group, its `text` field matches at
    least one expression of that group (AND across groups, OR inside a group).

    Parameters:
        alt_expressions: list of lists of strings, e.g.
            [['happy', 'merry'], ['christmas', 'xmas']]. A group given as a single
            string is treated as a one-element group.
        max_webentities: maximum number of distinct web entities that can be
            counted. It is passed as the `size` of the terms aggregation; without
            it Elasticsearch only returns its default top 10 buckets, which
            silently capped the count at 10.

    Returns:
        int: number of distinct `webentity` values among the matching pages,
        at most `max_webentities`. A warning is emitted when the result was
        truncated by that limit.

    Uses the notebook-level `es` client and `settings['es_index']`.
    """
    import warnings

    # A bare string used as a group would otherwise be split into characters.
    groups = [[g] if isinstance(g, str) else g for g in alt_expressions]

    page_filter = [{"match": {"type": "page"}}]
    # One nested bool/should per group: any expression of the group may match.
    group_filters = [
        {"bool": {"should": [{"match": {"text": e}} for e in expressions]}}
        for expressions in groups
    ]

    result = es.search(
        index=settings['es_index'],
        body={
            "query": {
                "bool": {
                    "must": page_filter + group_filters
                }
            },
            "size": 0,  # only the aggregation is needed, not the page hits
            "aggs": {
                "group_by_webentity": {
                    "terms": {
                        "field": "webentity",
                        "size": max_webentities
                    }
                }
            }
        }
    )

    aggregation = result['aggregations']['group_by_webentity']

    # `sum_other_doc_count` counts the documents that fell into buckets which were
    # not returned; anything above zero means the bucket list is incomplete.
    if aggregation.get('sum_other_doc_count', 0) > 0:
        warnings.warn(
            "More than {} web entities matched; the returned count is a lower bound. "
            "Increase max_webentities.".format(max_webentities),
            stacklevel=2,
        )

    return len(aggregation['buckets'])

# Usage:
# count_webentities_containing_alt([['happy', 'merry'], ['christmas', 'xmas']])

In [61]:
# Ad-hoc query: pages whose text matches "funny" OR "sad", grouped by web entity.
# The outer `must` requires a page document AND the nested `bool`; the nested `should`
# is satisfied when either of its two `match` clauses hits.
# NOTE(review): the terms aggregation has no `size`. The output recorded below shows
# 10 buckets together with a non-zero `sum_other_doc_count`, i.e. some matching web
# entities are not listed in the buckets.
es.search(
        index=settings['es_index'],
        body={
            "query": {
                "bool": {
                  "must": [
                      { "match": { "type": "page" } },
                      {
                          "bool": {
                              "should": [
                                  { "match": { "text": "funny" }},
                                  { "match": { "text": "sad" }}
                              ]
                          }
                      }
                  ]
                }
            },
            "size": 0,  # no individual hits wanted, only the aggregation result
            "aggs": {
                "group_by_webentity": {
                    "terms": {
                        "field": "webentity"
                    }
                }
            }
        }
    )

{'took': 6,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 18, 'max_score': 0.0, 'hits': []},
 'aggregations': {'group_by_webentity': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 3,
   'buckets': [{'key': 44, 'doc_count': 3},
    {'key': 156, 'doc_count': 3},
    {'key': 55, 'doc_count': 2},
    {'key': 14, 'doc_count': 1},
    {'key': 58, 'doc_count': 1},
    {'key': 59, 'doc_count': 1},
    {'key': 60, 'doc_count': 1},
    {'key': 67, 'doc_count': 1},
    {'key': 99, 'doc_count': 1},
    {'key': 107, 'doc_count': 1}]}}}