In this walkthrough we will build an application with full-text search capabilities, adding features incrementally that will be exposed via a simple REST API.

## Install Prerequisites

In [1]:
! pip install "pymongo[srv]"


You should consider upgrading via the '/Users/tom.mccarthy/Projects/tommcc-demos/.venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
# Just making sure everything prints cleanly

from pygments.style import Style
from pygments.token import Token
from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import Terminal256Formatter


class MyStyle(Style):
    """Pygments style that maps token types to ANSI terminal colours."""

    # Token type -> ANSI colour name applied by the formatter.
    styles = {
        Token.String: "ansigreen",
        Token.Literal: "ansibrightyellow",
        Token.Keyword: "ansimagenta",
        Token.Operator: "ansibrightmagenta",
    }

def pp(doc):
    """Pretty-print MongoDB data as indented, syntax-highlighted JSON.

    Args:
        doc: Anything ``bson.json_util.dumps`` can serialise; the notebook
            passes the cursor returned by ``aggregate``.

    Returns:
        None. The highlighted JSON is written to stdout.
    """
    # Imported locally so the helper works regardless of whether the cell
    # that imports json_util into the notebook namespace has run yet.
    from bson import json_util

    # json_util.dumps forwards keyword arguments to json.dumps, so the
    # indentation can be requested directly without a loads/dumps round trip.
    formatted_json = json_util.dumps(doc, indent=4)
    colorful_json = highlight(formatted_json, JsonLexer(), Terminal256Formatter(style=MyStyle))
    print(colorful_json)

Add the parent directory to `sys.path` so that the `config` module can be imported.

In [3]:
import sys
import os

# Make the parent directory importable so `config` can be loaded; the guard
# keeps sys.path free of duplicate entries when this cell is re-run.
if ".." not in sys.path:
    sys.path.append("..")

In [4]:
import pymongo
from bson import json_util
from config import mongo_uri
import json
    
# Open a client with the connection string imported from config, then take a
# handle to the `movies` collection of the `sample_mflix` database.
conn = pymongo.MongoClient(mongo_uri)

movies_collection = conn.sample_mflix.movies

## Basic Search

Run a simple text search.

In [5]:
# Text search for "fight" in the title; project each title with its
# relevance score.
pipeline = [
    {"$search": {"text": {"query": "fight", "path": "title"}}},
    {
        "$project": {
            "title": 1,
            "_id": 0,
            "score": {"$meta": "searchScore"},
        }
    },
]
docs = movies_collection.aggregate(pipeline)
pp(docs)

[
    {
        "title": [32m"Fighting"[39m,
        "score": [93m4.2303690910339355[39m
    },
    {
        "title": [32m"Fight, Zatoichi, Fight"[39m,
        "score": [93m4.106432914733887[39m
    },
    {
        "title": [32m"Fight, Zatoichi, Fight"[39m,
        "score": [93m4.106432914733887[39m
    },
    {
        "title": [32m"The Fighting Lady"[39m,
        "score": [93m3.4211220741271973[39m
    },
    {
        "title": [32m"Fight Club"[39m,
        "score": [93m3.4211220741271973[39m
    },
    {
        "title": [32m"The Fighting Temptations"[39m,
        "score": [93m3.4211220741271973[39m
    },
    {
        "title": [32m"Street Fight"[39m,
        "score": [93m3.4211220741271973[39m
    },
    {
        "title": [32m"Girl Fight"[39m,
        "score": [93m3.4211220741271973[39m
    },
    {
        "title": [32m"A Fighting Man"[39m,
        "score": [93m3.4211220741271973[39m
    },
    {
        "title": [32m"Fight Back to School"

## Fuzzy

Often referred to as approximate string matching, fuzzy matching is a technique for finding strings that match a pattern approximately rather than exactly. Its common use cases include handling frequent misspellings and typing errors ("fat fingering").

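**maxEdits** sets the maximum Levenshtein distance allowed between the query term and a match. The Levenshtein distance is the number of single-character edits needed to turn one string into the other.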

In [6]:
# Fuzzy text search: the misspelled query "might cub" may differ from a
# title term by up to two edits.
pipeline = [
    {
        "$search": {
            "text": {
                "query": "might cub",
                "path": "title",
                "fuzzy": {"maxEdits": 2},
            }
        }
    },
    {
        "$project": {
            "title": 1,
            "_id": 0,
            "score": {"$meta": "searchScore"},
        }
    },
]
docs = movies_collection.aggregate(pipeline)
pp(docs)


[
    {
        "title": [32m"Fight Club"[39m,
        "score": [93m3.7854979038238525[39m
    },
    {
        "title": [32m"Cub"[39m,
        "score": [93m3.6564555168151855[39m
    },
    {
        "title": [32m"Knight of Cups"[39m,
        "score": [93m3.331955909729004[39m
    },
    {
        "title": [32m"Fright Night"[39m,
        "score": [93m3.1747939586639404[39m
    },
    {
        "title": [32m"Fright Night"[39m,
        "score": [93m3.1747939586639404[39m
    },
    {
        "title": [32m"Eight Crazy Nights"[39m,
        "score": [93m3.045708179473877[39m
    },
    {
        "title": [32m"Friday Night Lights"[39m,
        "score": [93m3.045708179473877[39m
    },
    {
        "title": [32m"The High and the Mighty"[39m,
        "score": [93m2.9480228424072266[39m
    },
    {
        "title": [32m"Monster High: Friday Night Frights"[39m,
        "score": [93m2.7375903129577637[39m
    },
    {
        "title": [32m"Fight for Your Ri

## Highlighting

Add a relevance score and hit highlights to the results.

In [7]:
# Text search for "fight" in the title that also requests hit highlights
# for the title field.
pipeline = [
    {
        "$search": {
            "text": {"query": "fight", "path": "title"},
            # text highlighting
            "highlight": {"path": "title"},
        }
    },
    {
        "$project": {
            "title": 1,
            "_id": 0,
            # Projected exactly once: a repeated key in a dict literal would
            # silently override the earlier entry.
            "score": {"$meta": "searchScore"},
            "highlights": {"$meta": "searchHighlights"},
        }
    },
]

docs = movies_collection.aggregate(pipeline)
pp(docs)

[
    {
        "title": [32m"Fighting"[39m,
        "score": [93m4.2303690910339355[39m,
        "highlights": [
            {
                "score": [93m1.408543348312378[39m,
                "path": [32m"title"[39m,
                "texts": [
                    {
                        "value": [32m"Fighting"[39m,
                        "type": [32m"hit"[39m
                    }
                ]
            }
        ]
    },
    {
        "title": [32m"Fight, Zatoichi, Fight"[39m,
        "score": [93m4.106432914733887[39m,
        "highlights": [
            {
                "score": [93m1.1319897174835205[39m,
                "path": [32m"title"[39m,
                "texts": [
                    {
                        "value": [32m"Fight"[39m,
                        "type": [32m"hit"[39m
                    },
                    {
                        "value": [32m", Zatoichi, "[39m,
                        "type": [32m"text"[39m
     

## Autocomplete

Edit the "query" field to try different searches.

In [8]:
# Autocomplete on the title for the prefix "fl", keeping five results.
pipeline = [
    {"$search": {"autocomplete": {"query": "fl", "path": "title"}}},
    {
        "$project": {
            "title": 1,
            "score": {"$meta": "searchScore"},
            "_id": 0,
        }
    },
    {"$limit": 5},
    # NOTE(review): $sort runs after $limit, so it only reorders the five
    # documents already selected -- confirm this is the intended order.
    {"$sort": {"score": -1}},
]

docs = movies_collection.aggregate(pipeline)
pp(docs)

[
    {
        "title": [32m"The Flim-Flam Man"[39m,
        "score": [93m4.277243614196777[39m
    },
    {
        "title": [32m"Flower Drum Song"[39m,
        "score": [93m4.10808801651001[39m
    },
    {
        "title": [32m"Flatfoot in Africa"[39m,
        "score": [93m4.10808801651001[39m
    },
    {
        "title": [32m"Flodder in Amerika!"[39m,
        "score": [93m4.10808801651001[39m
    },
    {
        "title": [32m"Flesh and Bone"[39m,
        "score": [93m4.10808801651001[39m
    }
]



# Compound

Compound queries let you combine multiple conditions in a single search query. Each condition is a clause (such as `must` or `mustNot`, used below) that contains one or more sub-queries.

In [10]:
# Compound search: the title must match "Fight" and must not match
# "Zatoichi" or "Club".
pipeline = [
    {
        "$search": {
            "compound": {
                "must": [
                    {"text": {"query": "Fight", "path": "title"}},
                ],
                "mustNot": [
                    {"text": {"query": "Zatoichi", "path": "title"}},
                    {"text": {"query": "Club", "path": "title"}},
                ],
            }
        }
    },
    {
        "$project": {
            "title": 1,
            "score": {"$meta": "searchScore"},
            "_id": 0,
        }
    },
    {"$limit": 5},
    # NOTE(review): $sort runs after $limit, so it only reorders the five
    # documents already selected -- confirm this is the intended order.
    {"$sort": {"score": -1}},
]

docs = movies_collection.aggregate(pipeline)
pp(docs)

[
    {
        "title": [32m"Fighting"[39m,
        "score": [93m4.2303690910339355[39m
    },
    {
        "title": [32m"The Fighting Lady"[39m,
        "score": [93m3.4211220741271973[39m
    },
    {
        "title": [32m"The Fighting Temptations"[39m,
        "score": [93m3.4211220741271973[39m
    },
    {
        "title": [32m"Street Fight"[39m,
        "score": [93m3.4211220741271973[39m
    },
    {
        "title": [32m"Girl Fight"[39m,
        "score": [93m3.4211220741271973[39m
    }
]



# Facets

In [11]:
from dateutil import parser

# Facet metadata for movies released between 2000-01-01 and 2015-01-31:
# a total count plus buckets by director, year range and genre.
pipeline = [
    {
        "$searchMeta": {
            "count": {"type": "total"},
            "facet": {
                "operator": {
                    "range": {
                        "path": "released",
                        "gte": parser.parse("2000-01-01T00:00:00.000Z"),
                        "lte": parser.parse("2015-01-31T00:00:00.000Z"),
                    }
                },
                "facets": {
                    "directorsFacet": {
                        "type": "string",
                        "path": "directors",
                        "numBuckets": 5,
                    },
                    "yearFacet": {
                        "type": "number",
                        "path": "year",
                        "boundaries": [2000, 2005, 2010, 2015],
                    },
                    "genresFacet": {
                        "type": "string",
                        "path": "genres",
                        "numBuckets": 5,
                    },
                },
            },
        }
    }
]

docs = movies_collection.aggregate(pipeline)
pp(docs)

[
    {
        "count": {
            "total": [93m13064[39m
        },
        "facet": {
            "genresFacet": {
                "buckets": [
                    {
                        "_id": [32m"Drama"[39m,
                        "count": [93m7387[39m
                    },
                    {
                        "_id": [32m"Comedy"[39m,
                        "count": [93m3795[39m
                    },
                    {
                        "_id": [32m"Romance"[39m,
                        "count": [93m1863[39m
                    },
                    {
                        "_id": [32m"Thriller"[39m,
                        "count": [93m1592[39m
                    },
                    {
                        "_id": [32m"Documentary"[39m,
                        "count": [93m1573[39m
                    }
                ]
            },
            "yearFacet": {
                "buckets": [
                    {
           

# Synonyms

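Requires a search index configured with synonym mappings for the `title` field; the queries below reference mappings named `transportSynonyms` and `attireSynonyms`.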

In [12]:
# Title search for "automobile" using the transportSynonyms mapping,
# limited to ten results.
pipeline = [
    {
        "$search": {
            "text": {
                "path": "title",
                "query": "automobile",
                "synonyms": "transportSynonyms",
            }
        }
    },
    {"$limit": 10},
    {
        "$project": {
            "_id": 0,
            "title": 1,
            "score": {"$meta": "searchScore"},
        }
    },
]

docs = movies_collection.aggregate(pipeline)
pp(docs)

[
    {
        "title": [32m"Cars"[39m,
        "score": [93m4.197734832763672[39m
    },
    {
        "title": [32m"Planes, Trains & Automobiles"[39m,
        "score": [93m3.8511905670166016[39m
    },
    {
        "title": [32m"Car Wash"[39m,
        "score": [93m3.39473032951355[39m
    },
    {
        "title": [32m"Used Cars"[39m,
        "score": [93m3.39473032951355[39m
    },
    {
        "title": [32m"Blue Car"[39m,
        "score": [93m3.39473032951355[39m
    },
    {
        "title": [32m"Cars 2"[39m,
        "score": [93m3.39473032951355[39m
    },
    {
        "title": [32m"Stealing Cars"[39m,
        "score": [93m3.39473032951355[39m
    },
    {
        "title": [32m"Cop Car"[39m,
        "score": [93m3.39473032951355[39m
    },
    {
        "title": [32m"The Cars That Eat People"[39m,
        "score": [93m2.8496146202087402[39m
    },
    {
        "title": [32m"Khrustalyov, My Car!"[39m,
        "score": [93m2.8496146202087

In [13]:
# Compound "should" search combining two synonym mappings: "boat" via
# transportSynonyms and "hat" via attireSynonyms, limited to ten results.
pipeline = [
    {
        "$search": {
            "compound": {
                "should": [
                    {
                        "text": {
                            "path": "title",
                            "query": "boat",
                            "synonyms": "transportSynonyms",
                        }
                    },
                    {
                        "text": {
                            "path": "title",
                            "query": "hat",
                            "synonyms": "attireSynonyms",
                        }
                    },
                ]
            }
        }
    },
    {"$limit": 10},
    {
        "$project": {
            "_id": 0,
            "title": 1,
            "score": {"$meta": "searchScore"},
        }
    },
]

docs = movies_collection.aggregate(pipeline)
pp(docs)

[
    {
        "title": [32m"Fedora"[39m,
        "score": [93m5.673145294189453[39m
    },
    {
        "title": [32m"Vessel"[39m,
        "score": [93m5.373150825500488[39m
    },
    {
        "title": [32m"Boats"[39m,
        "score": [93m4.589139938354492[39m
    },
    {
        "title": [32m"And the Ship Sails On"[39m,
        "score": [93m4.3452959060668945[39m
    },
    {
        "title": [32m"Broken Vessels"[39m,
        "score": [93m4.3452959060668945[39m
    },
    {
        "title": [32m"Sailing to Paradise"[39m,
        "score": [93m4.3452959060668945[39m
    },
    {
        "title": [32m"Top Hat"[39m,
        "score": [93m4.066137313842773[39m
    },
    {
        "title": [32m"A Hatful of Rain"[39m,
        "score": [93m4.066137313842773[39m
    },
    {
        "title": [32m"Boat People"[39m,
        "score": [93m3.711261749267578[39m
    },
    {
        "title": [32m"Boat Trip"[39m,
        "score": [93m3.711261749267578[39

In [42]:
from bson import ObjectId

# Step 1: fetch the reference movie. Only the fields that drive the
# similarity search are projected; `_id` is read from the result below.
pipeline = [
    {"$match": {"title": "The Godfather"}},
    {"$project": {"genres": 1, "plot": 1}},
    # Only the first match is used, so avoid pulling any further documents.
    {"$limit": 1},
]

# next(..., None) takes the first result without materialising the cursor
# and lets a missing movie be reported clearly instead of as a bare
# "list index out of range".
doc = next(movies_collection.aggregate(pipeline), None)
if doc is None:
    raise LookupError('No movie titled "The Godfather" was found')

# Step 2: search for movies similar to the reference document, excluding
# the reference document itself by its _id.
pipeline = [
    {
        "$search": {
            "compound": {
                "must": [{"moreLikeThis": {"like": doc}}],
                "mustNot": [
                    {
                        "equals": {
                            "path": "_id",
                            "value": doc["_id"],
                        }
                    }
                ],
            }
        }
    },
    {"$project": {"score": {"$meta": "searchScore"}, "title": 1}},
]


docs = movies_collection.aggregate(pipeline)

pp(docs)

[
    {
        "_id": {
            "$oid": [32m"573a13b1f29313caabd36962"[39m
        },
        "title": [32m"Romanzo Criminale"[39m,
        "score": [93m7.608759880065918[39m
    },
    {
        "_id": {
            "$oid": [32m"573a1398f29313caabce92e8"[39m
        },
        "title": [32m"Night Patrol"[39m,
        "score": [93m6.961604595184326[39m
    },
    {
        "_id": {
            "$oid": [32m"573a1398f29313caabce9d9c"[39m
        },
        "title": [32m"Anguish"[39m,
        "score": [93m6.427550315856934[39m
    },
    {
        "_id": {
            "$oid": [32m"573a13b5f29313caabd4397a"[39m
        },
        "title": [32m"Curse of the Golden Flower"[39m,
        "score": [93m6.113086700439453[39m
    },
    {
        "_id": {
            "$oid": [32m"573a13ecf29313caabdd1e08"[39m
        },
        "title": [32m"The White Haired Witch of Lunar Kingdom"[39m,
        "score": [93m5.955801963806152[39m
    },
    {
        "_id": {
    

# More like this

In [14]:
# Uncomment to close the MongoClient (`conn`) once you are finished with the
# notebook.
# conn.close()