In [1]:
import requests
from pprint import pprint

In [2]:
# Base URL of the Elasticsearch index targeted by the helper functions in this notebook.
ES_URI = "http://localhost:9200/my_library"

# Index settings defining a custom analyzer named "standard_clone":
# the standard tokenizer followed by the lowercase and stop token filters.
standard_analyzer_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "standard_clone": {
                    "tokenizer": "standard",
                    # Filters are listed in the order they are applied to the token stream.
                    "filter": [
                        "lowercase",
                        "stop"
                    ]
                }
            }
        }
    }
}

In [3]:
# Sample texts passed to analyze_query() to inspect the tokens an analyzer emits.
STRANGELOVE = "Dr. Strangelove: Or How I Learned To Stop Worrying and Love the Bomb"
# Four inflections of "flower"; used to check that stemming collapses them to one token.
FLOWER = "flower flowers flowering flowered"

def create_index(settings):
    """Create the index at ``ES_URI`` and print the outcome.

    Sends ``PUT ES_URI`` with ``settings`` as the JSON body.

    Args:
        settings: JSON-serialisable dict used as the index-creation body
            (e.g. ``{"settings": {...}}``).

    Prints:
        ``Index created <response json>`` when the HTTP status is a success,
        ``Index creation failed <response json>`` otherwise, so a rejected
        request is not reported as a success.
    """
    response = requests.put(ES_URI, json=settings)
    # Response.ok is False for 4xx/5xx statuses; the JSON body then carries the error.
    label = "Index created" if response.ok else "Index creation failed"
    print(label, response.json())

def remove_index():
    """Delete the index at ``ES_URI`` and print the outcome.

    Sends ``DELETE ES_URI``.

    Prints:
        ``Index removed <response json>`` on success,
        ``Index not found, nothing to remove <response json>`` on HTTP 404
        (e.g. the first run against a cluster where the index was never created),
        ``Index removal failed <response json>`` for any other error status.
    """
    response = requests.delete(ES_URI)
    if response.ok:
        label = "Index removed"
    elif response.status_code == 404:
        # A missing index is harmless for this notebook, but it is not a removal.
        label = "Index not found, nothing to remove"
    else:
        label = "Index removal failed"
    print(label, response.json())
    
def analyze_query(analyzer, text):
    """Run ``text`` through ``analyzer`` on the index and pretty-print the result.

    Issues a GET to ``{ES_URI}/_analyze`` with a JSON body holding the text and
    the analyzer name, then pretty-prints the decoded JSON response.

    Args:
        analyzer: Name of the analyzer to apply.
        text: Text to analyze.
    """
    endpoint = f"{ES_URI}/_analyze"
    response = requests.get(endpoint, json={"text": text, "analyzer": analyzer})
    pprint(response.json())
    

In [4]:
remove_index()

Index removed {'acknowledged': True}


In [5]:
create_index(standard_analyzer_settings)

Index created {'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_library'}


In [6]:
analyze_query("standard_clone", STRANGELOVE)

{'tokens': [{'end_offset': 2,
             'position': 0,
             'start_offset': 0,
             'token': 'dr',
             'type': '<ALPHANUM>'},
            {'end_offset': 15,
             'position': 1,
             'start_offset': 4,
             'token': 'strangelove',
             'type': '<ALPHANUM>'},
            {'end_offset': 23,
             'position': 3,
             'start_offset': 20,
             'token': 'how',
             'type': '<ALPHANUM>'},
            {'end_offset': 25,
             'position': 4,
             'start_offset': 24,
             'token': 'i',
             'type': '<ALPHANUM>'},
            {'end_offset': 33,
             'position': 5,
             'start_offset': 26,
             'token': 'learned',
             'type': '<ALPHANUM>'},
            {'end_offset': 41,
             'position': 7,
             'start_offset': 37,
             'token': 'stop',
             'type': '<ALPHANUM>'},
            {'end_offset': 50,
             'positi

In [7]:
remove_index()

Index removed {'acknowledged': True}


In [8]:
# Index settings defining a custom analyzer named "english_clone": the standard
# tokenizer followed by possessive stemming, lowercasing, English stop-word
# removal, keyword marking and English stemming (applied in that order).
# NOTE(review): this only *defines* the analyzer; nothing here maps it to a field
# or registers it as the index default analyzer — confirm that is intended.
english_analyzer_settings = {
    "settings": {
        "analysis": {
            "filter": {
                # Stop filter configured with the predefined "_english_" stop-word list.
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                },
                # Marks the listed tokens as keywords (per the Elasticsearch docs,
                # stemmers later in the chain leave keyword-marked tokens unchanged).
                "english_keywords": {
                    "type": "keyword_marker",
                    "keywords": ["example"]
                },
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "english"
                },
                # NOTE(review): "possesive" is a misspelling of "possessive"; in the
                # visible notebook the name is only referenced in the filter list below.
                "english_possesive_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english"
                }
            },
            "analyzer": {
                "english_clone": {
                    "tokenizer": "standard",
                    # Filters are listed in the order they are applied to the token stream.
                    "filter": [
                        "english_possesive_stemmer",
                        "lowercase",
                        "english_stop",
                        "english_keywords",
                        "english_stemmer"
                    ]
                }
            }
        }
    }
}

In [9]:
create_index(english_analyzer_settings)

Index created {'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_library'}


In [10]:
analyze_query("english_clone", STRANGELOVE)

{'tokens': [{'end_offset': 2,
             'position': 0,
             'start_offset': 0,
             'token': 'dr',
             'type': '<ALPHANUM>'},
            {'end_offset': 15,
             'position': 1,
             'start_offset': 4,
             'token': 'strangelov',
             'type': '<ALPHANUM>'},
            {'end_offset': 23,
             'position': 3,
             'start_offset': 20,
             'token': 'how',
             'type': '<ALPHANUM>'},
            {'end_offset': 25,
             'position': 4,
             'start_offset': 24,
             'token': 'i',
             'type': '<ALPHANUM>'},
            {'end_offset': 33,
             'position': 5,
             'start_offset': 26,
             'token': 'learn',
             'type': '<ALPHANUM>'},
            {'end_offset': 41,
             'position': 7,
             'start_offset': 37,
             'token': 'stop',
             'type': '<ALPHANUM>'},
            {'end_offset': 50,
             'position'

In [11]:
analyze_query("english_clone", FLOWER)

{'tokens': [{'end_offset': 6,
             'position': 0,
             'start_offset': 0,
             'token': 'flower',
             'type': '<ALPHANUM>'},
            {'end_offset': 14,
             'position': 1,
             'start_offset': 7,
             'token': 'flower',
             'type': '<ALPHANUM>'},
            {'end_offset': 24,
             'position': 2,
             'start_offset': 15,
             'token': 'flower',
             'type': '<ALPHANUM>'},
            {'end_offset': 33,
             'position': 3,
             'start_offset': 25,
             'token': 'flower',
             'type': '<ALPHANUM>'}]}


In [12]:
# A loose paraphrase of STRANGELOVE, presumably to compare which stemmed tokens
# the two texts share under the same analyzer.
# NOTE(review): "star" may be a typo for "start" — confirm before relying on the output.
ALMOST_STRANGELOVE = "mr. weirdlove:  don't worry I'm learning to star loving bombs"
analyze_query("english_clone", ALMOST_STRANGELOVE)

{'tokens': [{'end_offset': 2,
             'position': 0,
             'start_offset': 0,
             'token': 'mr',
             'type': '<ALPHANUM>'},
            {'end_offset': 13,
             'position': 1,
             'start_offset': 4,
             'token': 'weirdlov',
             'type': '<ALPHANUM>'},
            {'end_offset': 21,
             'position': 2,
             'start_offset': 16,
             'token': "don't",
             'type': '<ALPHANUM>'},
            {'end_offset': 27,
             'position': 3,
             'start_offset': 22,
             'token': 'worri',
             'type': '<ALPHANUM>'},
            {'end_offset': 31,
             'position': 4,
             'start_offset': 28,
             'token': "i'm",
             'type': '<ALPHANUM>'},
            {'end_offset': 40,
             'position': 5,
             'start_offset': 32,
             'token': 'learn',
             'type': '<ALPHANUM>'},
            {'end_offset': 48,
             'positi

In [13]:
# Index settings defining a custom analyzer named "phonetic": the standard
# tokenizer followed by lowercasing and a Double Metaphone phonetic filter.
# NOTE: the "phonetic" token filter type is provided by the Elasticsearch
# analysis-phonetic plugin, which must be installed on the node for index
# creation with these settings to succeed.
phonetic_analyzer_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "phonetic": {
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "my_doublemetaphone"
                    ]
                }
            },
            "filter": {
                "my_doublemetaphone": {
                    "type": "phonetic",
                    "encoder": "doublemetaphone",
                    # replace=True: emit the phonetic code(s) instead of the original token.
                    "replace": True
                }
            }
        }       
    }
}

In [14]:
remove_index()
create_index(phonetic_analyzer_settings)

Index removed {'acknowledged': True}
Index created {'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_library'}


In [15]:
analyze_query("phonetic", "Message from Dalai Lama")

{'tokens': [{'end_offset': 7,
             'position': 0,
             'start_offset': 0,
             'token': 'MSJ',
             'type': '<ALPHANUM>'},
            {'end_offset': 7,
             'position': 0,
             'start_offset': 0,
             'token': 'MSK',
             'type': '<ALPHANUM>'},
            {'end_offset': 12,
             'position': 1,
             'start_offset': 8,
             'token': 'FRM',
             'type': '<ALPHANUM>'},
            {'end_offset': 18,
             'position': 2,
             'start_offset': 13,
             'token': 'TL',
             'type': '<ALPHANUM>'},
            {'end_offset': 23,
             'position': 3,
             'start_offset': 19,
             'token': 'LM',
             'type': '<ALPHANUM>'}]}


In [16]:
analyze_query("phonetic", "Message from tall llama")

{'tokens': [{'end_offset': 7,
             'position': 0,
             'start_offset': 0,
             'token': 'MSJ',
             'type': '<ALPHANUM>'},
            {'end_offset': 7,
             'position': 0,
             'start_offset': 0,
             'token': 'MSK',
             'type': '<ALPHANUM>'},
            {'end_offset': 12,
             'position': 1,
             'start_offset': 8,
             'token': 'FRM',
             'type': '<ALPHANUM>'},
            {'end_offset': 17,
             'position': 2,
             'start_offset': 13,
             'token': 'TL',
             'type': '<ALPHANUM>'},
            {'end_offset': 23,
             'position': 3,
             'start_offset': 18,
             'token': 'LM',
             'type': '<ALPHANUM>'}]}


### 4.3

In [17]:
remove_index()

Index removed {'acknowledged': True}


In [18]:
# Recreate the index with a single primary shard.
# NOTE(review): presumably so that all documents share one shard's term statistics
# and the relevance scores in the searches below are directly comparable — TODO confirm.
create_index({"settings": {"number_of_shards": 1}})

Index created {'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_library'}


In [19]:
def index_document(id, doc):
    """Index ``doc`` under ``id`` and print the outcome.

    Sends ``PUT {ES_URI}/_doc/{id}`` with ``doc`` as the JSON body.

    Args:
        id: Document id interpolated into the request path.
        doc: JSON-serialisable document body.

    Prints:
        ``Indexed <response json>`` when the HTTP status is a success,
        ``Indexing failed <response json>`` otherwise, so a rejected
        document is not reported as indexed.
    """
    response = requests.put(f"{ES_URI}/_doc/{id}", json=doc)
    # Response.ok is False for 4xx/5xx statuses; the JSON body then carries the error.
    label = "Indexed" if response.ok else "Indexing failed"
    print(label, response.json())

In [20]:
index_document(1, {"title": "apple apple apple apple apple"})

Indexed {'_index': 'my_library', '_type': '_doc', '_id': '1', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}


In [21]:
index_document(2, {"title": "apple apple apple banana banana"})

Indexed {'_index': 'my_library', '_type': '_doc', '_id': '2', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}


In [22]:
index_document(3, {"title": "apple banana blueberry coconut"})

Indexed {'_index': 'my_library', '_type': '_doc', '_id': '3', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}


In [23]:
def search(title, explain=False):
    """Run a ``match`` query on the ``title`` field and return the raw response.

    Sends a GET to ``{ES_URI}/_search``. The typeless endpoint is used because
    specifying a mapping type in the search path (``/_doc/_search``) is
    deprecated in Elasticsearch 7 and no longer available in 8.

    Args:
        title: Query text matched against the ``title`` field.
        explain: When true, asks Elasticsearch to include a scoring
            explanation for each hit.

    Returns:
        The ``requests`` response object; call ``.json()`` on it for the hits.
    """
    return requests.get(
        f"{ES_URI}/_search",
        json={"explain": explain, "query": {"match": {"title": title}}}
    )

In [24]:
search("apple", True).json()

{'took': 15,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3, 'relation': 'eq'},
  'max_score': 0.2344793,
  'hits': [{'_shard': '[my_library][0]',
    '_node': 'fIdV0MKrS6SyS4ccGT0m1Q',
    '_index': 'my_library',
    '_type': '_doc',
    '_id': '1',
    '_score': 0.2344793,
    '_source': {'title': 'apple apple apple apple apple'},
    '_explanation': {'value': 0.2344793,
     'description': 'weight(title:apple in 0) [PerFieldSimilarity], result of:',
     'details': [{'value': 0.2344793,
       'description': 'score(freq=5.0), computed as boost * idf * tf from:',
       'details': [{'value': 2.2, 'description': 'boost', 'details': []},
        {'value': 0.13353139,
         'description': 'idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:',
         'details': [{'value': 3,
           'description': 'n, number of documents containing term',
           'details': []},
          {'value': 3,
           '

In [25]:
index_document(4, {"title": "apples apple"})

Indexed {'_index': 'my_library', '_type': '_doc', '_id': '4', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}


In [26]:
search("apple apple", False).json()

{'took': 6,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 4, 'relation': 'eq'},
  'max_score': 0.36076754,
  'hits': [{'_index': 'my_library',
    '_type': '_doc',
    '_id': '1',
    '_score': 0.36076754,
    '_source': {'title': 'apple apple apple apple apple'}},
   {'_index': 'my_library',
    '_type': '_doc',
    '_id': '2',
    '_score': 0.31429577,
    '_source': {'title': 'apple apple apple banana banana'}},
   {'_index': 'my_library',
    '_type': '_doc',
    '_id': '4',
    '_score': 0.26490647,
    '_source': {'title': 'apples apple'}},
   {'_index': 'my_library',
    '_type': '_doc',
    '_id': '3',
    '_score': 0.21072102,
    '_source': {'title': 'apple banana blueberry coconut'}}]}}

In [33]:
remove_index()

Index removed {'acknowledged': True}


In [35]:
create_index(english_analyzer_settings)

Index created {'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_library'}


In [38]:
remove_index()
# Apparently neither the book example nor the official Elastic docs example:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#english-analyzer
# solves the problem, so the regular built-in english analyzer is used instead.
# NOTE(review): english_analyzer_settings only defines the `english_clone`
# analyzer; it is not registered as the index "default" analyzer and no mapping
# assigns it to `title`, which is probably why it had no effect on search —
# TODO confirm.
create_index(
    {"settings": {
        "analysis": {
            "analyzer": {
                "default": {
                    "type": "english"
                }
            }
        }
      }
    }
)

Index removed {'acknowledged': True}
Index created {'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_library'}


In [29]:
def index_documents(documents):
    """Index every document in ``documents``, using its position as the id.

    Ids are assigned by enumeration and therefore start at 0; each document
    is sent through ``index_document``.

    Args:
        documents: Iterable of JSON-serialisable document bodies.
    """
    for doc_id, document in enumerate(documents):
        index_document(doc_id, document)

In [30]:
# The same four titles that were indexed one at a time earlier in the notebook,
# collected so they can be re-indexed in one call to index_documents().
docs = [
    {"title": "apple apple apple apple apple"},
    {"title": "apple apple apple banana banana"},
    {"title": "apple banana blueberry coconut"},
    {"title": "apples apple"}
]

In [39]:
index_documents(docs)

Indexed {'_index': 'my_library', '_type': '_doc', '_id': '0', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
Indexed {'_index': 'my_library', '_type': '_doc', '_id': '1', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
Indexed {'_index': 'my_library', '_type': '_doc', '_id': '2', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}
Indexed {'_index': 'my_library', '_type': '_doc', '_id': '3', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}


In [40]:
search("apple", False).json()

{'took': 7,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 4, 'relation': 'eq'},
  'max_score': 0.18038377,
  'hits': [{'_index': 'my_library',
    '_type': '_doc',
    '_id': '0',
    '_score': 0.18038377,
    '_source': {'title': 'apple apple apple apple apple'}},
   {'_index': 'my_library',
    '_type': '_doc',
    '_id': '3',
    '_score': 0.16857684,
    '_source': {'title': 'apples apple'}},
   {'_index': 'my_library',
    '_type': '_doc',
    '_id': '1',
    '_score': 0.15714788,
    '_source': {'title': 'apple apple apple banana banana'}},
   {'_index': 'my_library',
    '_type': '_doc',
    '_id': '2',
    '_score': 0.10536051,
    '_source': {'title': 'apple banana blueberry coconut'}}]}}