In [2]:
'''
This notebook is based on preprocess python notebook, It performs the same functionality but using es mapping

Here we will be using analyzers, filters, char_filter to produce the same effect
'''

'\nThis notebook is based on preprocess python notebook, It performs the same functionality but using es mapping\n\nHere we will be using analyzers, filters, char_filter to produce the same effect\n'

Data can be preprocessed at two phases
* Indexing phase
* Searching phase

##### Indexing phase

When new docs are inserted into a index, this is called indexing phase

##### Searching phase

When any query is searched in a index, this is called searching phase

For both the phases, we use custom analyzers which helps us in preprocessing the data. Analyzers in es consists of the following

* Char Filters
* Tokenizer
* Filters

A stream of text or text is initially passed to the char filter, which works at character level and performs any replacement/insertion/deletion, etc. Common examples can be replacing numbers with their string version, removing punctuation symbols, etc. After this stream of text is passed to tokenizer which tokenizes into tokens. Most common tokenizer is space tokenizer. After this, once we have the tokens, filters come into existence. They apply at the token level.

In [38]:
import json

from os.path import join

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

In [39]:
ES_HOST = 'http://localhost'
ES_PORT = 9202
ES_HOST_STRING = '{}:{}'.format(ES_HOST, ES_PORT)

INDEX_NAME = 'movies_v3'
TYPE_NAME = 'marvel'

In [40]:
# we will first define settings for index and then at last create index
settings = {
    "settings": {
        "index": {
            "analysis": {
                "filter": {},
                "analyzer": {},
                "char_filter": {}
            }
        }
    }
}

In [41]:
# lets first create a custom analyzer, so that we can add custom filter/char filters etc
movie_analyzer_v1 = {
    "filter": [],
    "char_filter": [],
    "type": "custom",
    "tokenizer": "standard"
}

In [42]:
# lets add support for lowercase
# lowercase exists as an inbuilt filter `lowercase`
movie_analyzer_v1['filter'].append('lowercase')
print json.dumps(movie_analyzer_v1, indent=2)

{
  "filter": [
    "lowercase"
  ], 
  "char_filter": [], 
  "type": "custom", 
  "tokenizer": "standard"
}


In [43]:
# lets add support to remove html tags from text
# `html_strip` exists as an inbuilt char filter
movie_analyzer_v1['char_filter'].append('html_strip')
print json.dumps(movie_analyzer_v1, indent=2)

{
  "filter": [
    "lowercase"
  ], 
  "char_filter": [
    "html_strip"
  ], 
  "type": "custom", 
  "tokenizer": "standard"
}


In [44]:
# at this point if we pass any text through `movie_analyzer` it will be lowercase and does not contain any html tags

In [45]:
# lets remove punctuation symbols
# for this there is no inbuild support in ES. we will have to create a custom `char_filter` for this
remove_punctuation = {
    'type': 'mapping',
    'mappings': [
        "! =>",
        "# =>",
        "$ =>",
        "% =>",
        "& =>",
        "' =>",
        "( =>",
        ") =>",
        "* =>",
        "+ =>",
        ", =>",
        "- =>",
        ". =>",
        "/ =>",
        ": =>",
        "; =>",
        "< =>",
        "= =>",
        "> =>",
        "? =>",
        "@ =>",
        "[ =>",
        "] =>",
        "^ =>",
        "_ =>",
        "` =>",
        "{ =>",
        "| =>",
        "} =>",
        "~ =>",
      ]
}
# add this in settings, so it can be used by any analyzer

settings['settings']['index']['analysis']['char_filter']['remove_punctuation'] = remove_punctuation

In [46]:
# add this to `movie_analyzer`
movie_analyzer_v1['char_filter'].append('remove_punctuation')
print json.dumps(movie_analyzer_v1, indent=2)

{
  "filter": [
    "lowercase"
  ], 
  "char_filter": [
    "html_strip", 
    "remove_punctuation"
  ], 
  "type": "custom", 
  "tokenizer": "standard"
}


In [47]:
# lets create a mapping at this point and see if our changes are correctly applied or not
settings['settings']['index']['analysis']['movie_analyzer_v1'] = movie_analyzer_v1

In [48]:
mappings = {
  "mappings": {
    "review": {
      "properties": {
        "content": {
          "type": "text"
        },
        "id": {
          "type": "long"
        },
        "sourceUrl": {
          "type": "keyword"
        }
      }
    }
  }
}

In [49]:
es = Elasticsearch([ES_HOST_STRING])

In [50]:
index_body = {}
index_body['settings'] = settings['settings']
index_body['mappings'] = mappings['mappings']
print json.dumps(index_body, indent=2)

{
  "mappings": {
    "review": {
      "properties": {
        "content": {
          "type": "text"
        }, 
        "sourceUrl": {
          "type": "keyword"
        }, 
        "id": {
          "type": "long"
        }
      }
    }
  }, 
  "settings": {
    "index": {
      "analysis": {
        "filter": {}, 
        "char_filter": {
          "remove_punctuation": {
            "type": "mapping", 
            "mappings": [
              "! =>", 
              "# =>", 
              "$ =>", 
              "% =>", 
              "& =>", 
              "' =>", 
              "( =>", 
              ") =>", 
              "* =>", 
              "+ =>", 
              ", =>", 
              "- =>", 
              ". =>", 
              "/ =>", 
              ": =>", 
              "; =>", 
              "< =>", 
              "= =>", 
              "> =>", 
              "? =>", 
              "@ =>", 
              "[ =>", 
              "] =>", 
              "^ =>", 
         

In [51]:
es.indices.create(index=INDEX_NAME, ignore=400, body=index_body)

{u'acknowledged': True, u'index': u'movies_v3', u'shards_acknowledged': True}

In [52]:
# lets run `analyze` api to see the result

In [53]:
# lets consider review 1 (id 1)
review_1_text = '''Moments that touch the heart are <strong>few</strong> and <strong>far</strong> between in this almost-culmination of a decade of Marvel Comics movies. '''

In [76]:
analyze_query = {
    'filter': ['lowercase'],
    'text': review_1_text
}

In [77]:
response = es.indices.analyze(index=INDEX_NAME, body=analyze_query)
print response['tokens'][0]['token']

moments that touch the heart are <strong>few</strong> and <strong>far</strong> between in this almost-culmination of a decade of marvel comics movies. 


In [78]:
# notice that the text is now in lower case

In [79]:
# lets remove html tags from review 1
analyze_query = {
    'tokenizer': 'standard', 
    'char_filter' : ['html_strip'],
    'text': review_1_text
}

In [80]:
response = es.indices.analyze(index=INDEX_NAME, body=analyze_query)
for token_obj in response['tokens']:
    print token_obj.get('token'),

Moments that touch the heart are few and far between in this almost culmination of a decade of Marvel Comics movies


In [81]:
# lets remove punctuation symbols
# lets consider review 3
review_3_text = '''?Avengers: Infinity War? takes you places that most superhero movies don?t ? and where you may not want to go.'''

In [82]:
analyze_query = {
    'char_filter': ['remove_punctuation'],
    'text': review_3_text
}

In [83]:
response = es.indices.analyze(index=INDEX_NAME, body=analyze_query)
print response['tokens'][0]['token']

Avengers Infinity War takes you places that most superhero movies dont  and where you may not want to go


In [None]:
# notice how the punctuation symbols are removed.