# MongoDB note

Topics:

    * Query documents and subdocuments, Counting Documents
    * Survey Distinct Values with filters, element match operator 
    * Filter with Regular Expressions
    * MongoDB Projection
    * Sort search result: post query sort and in-query sort 
    * Indexes in MongoDB
    * Limits and Skips search 
    * aggregation 


Create The Nobel Prize database. Mongodb Server was installed locally.
Load the Nobel prizes and laureates from the Nobel API and store it as a local database. 

In [3]:
import requests
#import the mongoclient class from pymongo 
from pymongo import MongoClient
# Client connects to "localhost" by default
client = MongoClient("mongodb://localhost:27017/")
# Create local "nobel" database on the fly
db = client["nobel"]
# API documented at https://nobelprize.readme.io/docs/prize
for collection_name in ["prizes","laureates"]:
    singular = collection_name[:-1]
    response = requests.get("http://api.nobelprize.org/v1/{}.json".format(singular))
    documents = response.json()[collection_name]
    # Create collections on the fly
    db[collection_name].insert_many(documents)

In [4]:
# You can also access dbs and collections as attributes
assert client.nobel == db
assert db.prizes == db["prizes"]
# Count documents
n_prizes = db.prizes.count_documents({})
n_laureates = db.laureates.count_documents({})
# Find one document to inspect
doc = db.prizes.find_one({})

## Counting Documents, and Finding one

Two way to get access to a collection: use bracket notation or dot notation. This is similar to pandas. A database is like a dictionary of collections with collection name as key.

`.count_document({})` for count documents
`.find_one()` get the JSON of a document
`list_database_names()` on a client instance
`list_collection_names()` list collection names on a database instance.

Put filter message in {}

In [5]:
# You can also access dbs and collections as attributes
assert client.nobel == db
assert db.prizes == db["prizes"]
# Count documents
n_prizes = db.prizes.count_documents({})
n_laureates = db.laureates.count_documents({})
# Find one document to inspect
doc = db.prizes.find_one({})

In [7]:
n_prizes

590

In [6]:
doc

{'_id': ObjectId('5c9fd8a604ac58346c84e6fe'),
 'year': '2018',
 'category': 'physics',
 'overallMotivation': '“for groundbreaking inventions in the field of laser physics”',
 'laureates': [{'id': '960',
   'firstname': 'Arthur',
   'surname': 'Ashkin',
   'motivation': '"for the optical tweezers and their application to biological systems"',
   'share': '2'},
  {'id': '961',
   'firstname': 'Gérard',
   'surname': 'Mourou',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'},
  {'id': '962',
   'firstname': 'Donna',
   'surname': 'Strickland',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'}]}

In [8]:
# Save a list of names of the databases managed by client
db_names = client.list_database_names()
print(db_names)

# Save a list of names of the collections managed by the "nobel" database
nobel_coll_names = client.nobel.list_collection_names()
print(nobel_coll_names)

['admin', 'config', 'local', 'nobel']
['laureates', 'prizes']


List fields and count laureates' prizes

In [9]:
# Connect to the "nobel" database
db = client.nobel

# Retrieve sample prize and laureate documents
prize = db.prizes.find_one()
laureate = db.laureates.find_one()

# Get lists of the fields present in each type of document
prize_fields = list(prize.keys())
laureate_fields = list(laureate.keys())

# Compute the total number of laureate prizes
count = sum(len(doc['prizes']) for doc in db.laureates.find())
print(count)

941


# Query documents and subdocuments

- Operator in MongoDB has '$' prefix.
- MongoDB compare string in alphabetical order
- query subdocuments using dot notation

In [10]:
# find one laureats
db.laureates.count_documents({
 'born': '1845-03-27',
 'diedCountry': 'Germany',
 'gender': 'male',
 'surname': 'Röntgen'
})

1

In [11]:
# find all femal laureats
db.laureates.count_documents({
  'gender': 'female'
})

51

In [13]:
# query a range
db.laureates.count_documents({
 'diedCountry': {
  '$in': ['France', 'USA']
 }})

259

In [14]:
# query not 
db.laureates.count_documents({
 'diedCountry': {
     # not equal
  '$ne': 'France'
 }})


884

In [15]:
db.laureates.count_documents({
 'diedCountry': {
     # greater than
  '$gt': 'Belgium',
     #less than
  '$lte': 'USA'
 }})

455

In [16]:
# Create a filter for laureates who died in the USA
criteria = {'diedCountry': 'USA'}

# Save a count of these laureates
count = db.laureates.count_documents(criteria)
print(count)

209


In [17]:
# Save a filter for laureates born in the USA, Canada, or Mexico
criteria = {'bornCountry': {"$in": ['USA','Canada','Mexico']}}

# Count them and save the count
count = db.laureates.count_documents(criteria)
print(count)

291


In [18]:
db.laureates.find_one({
  "firstname": "Walter",
  "surname": "Kohn"})

{'_id': ObjectId('5c9fd8a804ac58346c84ea68'),
 'id': '290',
 'firstname': 'Walter',
 'surname': 'Kohn',
 'born': '1923-03-09',
 'died': '2016-04-19',
 'bornCountry': 'Austria',
 'bornCountryCode': 'AT',
 'bornCity': 'Vienna',
 'diedCountry': 'USA',
 'diedCountryCode': 'US',
 'diedCity': 'Santa Barbara, CA',
 'gender': 'male',
 'prizes': [{'year': '1998',
   'category': 'chemistry',
   'share': '2',
   'motivation': '"for his development of the density-functional theory"',
   'affiliations': [{'name': 'University of California',
     'city': 'Santa Barbara, CA',
     'country': 'USA'}]}]}

In [19]:
# query subdocuments using dot notation
db.laureates.count_documents({
 "prizes.affiliations.name": (
  "University of California")})

34

In [20]:
db.laureates.count_documents({"bornCountry": {"$exists": False}})

33

In [22]:
db.laureates.count_documents({"prizes": {"$exists": True}})
#acess the first element of the prize field 
db.laureates.count_documents({"prizes.0": {"$exists": True}})
# acess the second element of the prize field 
db.laureates.count_documents({"prizes.1": {"$exists": True}})

6

In [23]:
# Filter for laureates born in Austria with non-Austria prize affiliation
criteria = {'bornCountry': 'Austria', 'prizes.affiliations.country': {"$ne": 'Austria'}}

# Count the number of such laureates
count = db.laureates.count_documents(criteria)
print(count)

10


In [24]:
# Filter for documents without a "born" field
criteria = {'born': {'$exists': False}}
assert db.laureates.count_documents(criteria) == 0

# Filter for laureates with at least three prizes
criteria = {'prizes.2': {'$exists': True}}
print(db.laureates.find_one(criteria))

{'_id': ObjectId('5c9fd8a804ac58346c84eb27'), 'id': '482', 'firstname': 'Comité international de la Croix Rouge (International Committee of the Red Cross)', 'born': '0000-00-00', 'died': '0000-00-00', 'gender': 'org', 'prizes': [{'year': '1917', 'category': 'peace', 'share': '1', 'affiliations': [[]]}, {'year': '1944', 'category': 'peace', 'share': '1', 'affiliations': [[]]}, {'year': '1963', 'category': 'peace', 'share': '2', 'affiliations': [[]]}]}


# Survey Distinct Values

`distinct()` method 

In [28]:
db.laureates.find_one({"prizes.2": {"$exists": True}})

{'_id': ObjectId('5c9fd8a804ac58346c84eb27'),
 'id': '482',
 'firstname': 'Comité international de la Croix Rouge (International Committee of the Red Cross)',
 'born': '0000-00-00',
 'died': '0000-00-00',
 'gender': 'org',
 'prizes': [{'year': '1917',
   'category': 'peace',
   'share': '1',
   'affiliations': [[]]},
  {'year': '1944', 'category': 'peace', 'share': '1', 'affiliations': [[]]},
  {'year': '1963', 'category': 'peace', 'share': '2', 'affiliations': [[]]}]}

In [26]:
# find all unique value in the gender field
db.laureates.distinct("gender")


['male', 'female', 'org']

In [27]:
db.laureates.distinct("prizes.category")

['physics', 'chemistry', 'peace', 'medicine', 'literature', 'economics']

In [29]:
# Countries recorded as countries of death but not as countries of birth
countries = set(db.laureates.distinct('diedCountry')) - set(db.laureates.distinct('bornCountry'))
print(countries)

{'Israel', 'Tunisia', 'East Germany', 'Barbados', 'Yugoslavia (now Serbia)', 'Puerto Rico', 'Northern Rhodesia (now Zambia)', 'Gabon', 'Greece', 'Philippines', 'USSR', 'Czechoslovakia', 'Jamaica'}


In [30]:
# The number of distinct countries of laureate affiliation for prizes
count = len(db.laureates.distinct('prizes.affiliations.country'))
print(count)

29


## Distinct Values Given Filters


In [31]:
# which laureates share prize with 4 people?
db.laureates.find_one({"prizes.share": "4"})

{'_id': ObjectId('5c9fd8a804ac58346c84e950'),
 'id': '5',
 'firstname': 'Pierre',
 'surname': 'Curie',
 'born': '1859-05-15',
 'died': '1906-04-19',
 'bornCountry': 'France',
 'bornCountryCode': 'FR',
 'bornCity': 'Paris',
 'diedCountry': 'France',
 'diedCountryCode': 'FR',
 'diedCity': 'Paris',
 'gender': 'male',
 'prizes': [{'year': '1903',
   'category': 'physics',
   'share': '4',
   'motivation': '"in recognition of the extraordinary services they have rendered by their joint researches on the radiation phenomena discovered by Professor Henri Becquerel"',
   'affiliations': [{'name': 'École municipale de physique et de chimie industrielles (Municipal School of Industrial Physics and Chemistry)',
     'city': 'Paris',
     'country': 'France'}]}]}

In [33]:
# find catergory of the nobel prize, which shared by 4 
db.laureates.distinct(
  "prizes.category", {"prizes.share": '4'})

['physics', 'chemistry', 'medicine']

In [34]:
# find catergory of the prize won't by laureates who win more than once prizes
db.laureates.distinct(
  "prizes.category", {"prizes.1": {"$exists": True}})

['chemistry', 'physics', 'peace']

In [35]:
db.prizes.find_one({})

{'_id': ObjectId('5c9fd8a604ac58346c84e6fe'),
 'year': '2018',
 'category': 'physics',
 'overallMotivation': '“for groundbreaking inventions in the field of laser physics”',
 'laureates': [{'id': '960',
   'firstname': 'Arthur',
   'surname': 'Ashkin',
   'motivation': '"for the optical tweezers and their application to biological systems"',
   'share': '2'},
  {'id': '961',
   'firstname': 'Gérard',
   'surname': 'Mourou',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'},
  {'id': '962',
   'firstname': 'Donna',
   'surname': 'Strickland',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'}]}

In [36]:
# Save a filter for prize documents with three or more laureates
criteria = {'laureates.2': {'$exists': True}}

# Save the set of distinct prize categories in documents satisfying the criteria
triple_play_categories = set(db.prizes.distinct('category', criteria))

# Confirm literature as the only category not satisfying the criteria.
assert set(db.prizes.distinct('category')) - triple_play_categories == {'literature'}

## how to use element match operator 

In [38]:
# count the nobel laureates in physics, which is shared by 1 person
db.laureates.count_documents({
    "prizes": {"$elemMatch": {"category": "physics", "share": "1"}}})

47

In [43]:
db.laureates.count_documents({
    "prizes": {"$elemMatch":{"category":"physics","share":"1","year":{"$gte":"1945"}}}
})

18

In [46]:
db.laureates.count_documents({
    "prizes": {"$elemMatch":{"category":"physics","share":{"$ne":"1"},"year":{"$gte":"1945"}}}
})

143

In [45]:
 #What is this ratio for the ratio of the number of laureates who won an unshared vs share prize?
# Save a filter for laureates with unshared prizes
unshared = {
    "prizes": {"$elemMatch": {
        'category': {"$nin": ["physics", "chemistry", "medicine"]},
        "share": "1",
        "year": {"$gte": "1945"},
    }}}

# Save a filter for laureates with shared prizes
shared = {
    "prizes": {"$elemMatch": {
        'category': {"$nin": ["physics", "chemistry", "medicine"]},
        "share": {'$ne': "1"},
        "year": {"$gte": "1945"},
    }}}

ratio = db.laureates.count_documents(unshared) / db.laureates.count_documents(shared)
print(ratio)

1.3653846153846154


# Filter with Regular Expressions

Operator $regex 

Using "$option" : "i", ensure case sensitive matching

Or use bson Regex class 

Reminder: in regular expression ^ to specify the beginning, $ for the end and \ to escape.

In [47]:
db.laureates.distinct("bornCountry", {"bornCountry": {"$regex": "Poland"}})

['Russian Empire (now Poland)',
 'Prussia (now Poland)',
 'Germany (now Poland)',
 'Austria-Hungary (now Poland)',
 'German-occupied Poland (now Poland)',
 'Poland',
 'Poland (now Ukraine)',
 'Poland (now Lithuania)',
 'Poland (now Belarus)',
 'Free City of Danzig (now Poland)']

In [48]:
case_sensitive = db.laureates.distinct(
    "bornCountry",
    {"bornCountry": {"$regex": "Poland"}})

case_insensitive = db.laureates.distinct(
    "bornCountry",
    {"bornCountry": {"$regex": "poland", "$options": "i"}})

assert set(case_sensitive) == set(case_insensitive)

In [49]:
from bson.regex import Regex

db.laureates.distinct("bornCountry", {"bornCountry": Regex("poland", "i")})

['Russian Empire (now Poland)',
 'Prussia (now Poland)',
 'Germany (now Poland)',
 'Austria-Hungary (now Poland)',
 'German-occupied Poland (now Poland)',
 'Poland',
 'Poland (now Ukraine)',
 'Poland (now Lithuania)',
 'Poland (now Belarus)',
 'Free City of Danzig (now Poland)']

In [50]:
from bson.regex import Regex

# Fill in a string value to be sandwiched between the strings "^" and "now"
criteria = {"bornCountry": Regex("^" + "Germany \\(" + "now")}
print(set(db.laureates.distinct("bornCountry", criteria)))

{'Germany (now Poland)', 'Germany (now Russia)', 'Germany (now France)'}


In [56]:
from bson.regex import Regex

# Save a filter for laureates with prize motivation values containing "transistor" as a substring
criteria = {"prizes.motivation": Regex("transistor")}

# Save the field names corresponding to a laureate's first name and last name
first, last = "firstname","surname"
print([(laureate[first], laureate[last]) for laureate in db.laureates.find(criteria)])



[('William Bradford', 'Shockley'), ('John', 'Bardeen'), ('Walter Houser', 'Brattain')]


# MongoDB Projection

- Fetch only the fields, which you are interested in. Because it only fetch part of the data, the operation is fast
- "1" for the field you want and 0 for the field do not want.
- Return an iterable. 
- return object id  ("_id") by default unless specified 

db.collection.find({filter},[projectionfield1,projectionfield2]}
db.collection.find({filter},{projectionfield1:"1",projectionfield2}}

save a list names of full names ("firstname" plus "surname") of laureates with initials G.S. (ignoring middles names/initials). 

In [57]:
# Collect a list of full names
names = [" ".join([doc['firstname'], doc['surname']])
         for doc in db.laureates.find(
             {"firstname": {"$regex": "^G"},
              "surname": {"$regex": "^S"}},
             ["firstname", "surname"])]
print(names)

['Glenn Theodore Seaborg', 'George D. Snell', 'Gustav Stresemann', 'George Bernard Shaw', 'Giorgos Seferis', 'George J. Stigler', 'George F. Smoot', 'George E. Smith', 'George P. Smith']


# Sort result 

If the dataset is small, post query sort is sufficient. If the data is large, then sort on the server before display.



## Post query sort 

In [58]:
docs = list(db.prizes.find({"category": "physics"}, ["year"]))

print([doc["year"] for doc in docs][:5])

['2018', '2017', '2016', '2015', '2014']


In [60]:
#sort data in chronological order 
from operator import itemgetter

#itemgetter fetch the value of key from the dictionary 
docs = sorted(docs, key=itemgetter("year"), reverse=False)
print([doc["year"] for doc in docs][:5])

['1901', '1902', '1903', '1904', '1905']


In [55]:
db.laureates.find(criteria)

<pymongo.cursor.Cursor at 0x1cb5eb7a1d0>

## In-query sort

- sort take a tuple `sort = [('key',1)]`
- Ascending order use 1 and descending order use -1 
- Sort can take a list of keyword arguments. The sort field does not have to be the contained in the projected fields

In [61]:
cursor = db.prizes.find({"category": "physics"}, ["year"],
                        sort=[("year", 1)])
print([doc["year"] for doc in cursor][:5])

['1901', '1902', '1903', '1904', '1905']


In [64]:
# sort by a list of keywords

for doc in db.prizes.find(
    # look for data between 1966 - 1970 
        {"year": {"$gt": "1966", "$lt": "1970"}},
    # projecting category and year columns 
        ["category", "year"],
    #sort by a list 
        sort=[("year", 1), ("category", -1)]):
    # unpack using ** syntax
    print("{year} {category}".format(**doc))

1967 physics
1967 medicine
1967 literature
1967 chemistry
1968 physics
1968 peace
1968 medicine
1968 literature
1968 chemistry
1969 physics
1969 peace
1969 medicine
1969 literature
1969 economics
1969 chemistry


Print out the names of all physics laureates, with one line printed for each award year, in chronological order. Each line will list laureates for that year in alphabetical order by surname ("last" name).

In [65]:
from operator import itemgetter

# Sort by ascending year
sort_spec = [("year", 1)]

# Construct a cursor over physics prizes
cursor = db.prizes.find({"category": "physics"}, ["year", "laureates.firstname", "laureates.surname"], sort=sort_spec)
docs = list(cursor)
for doc in sorted(docs, key=itemgetter("year")):
    print("{year}: {first_laureate_surname}".format(
        year=doc["year"], first_laureate_surname=doc["laureates"][0]["surname"]))
cursor.rewind() # Rewind cursor to reuse in the next step

# Define a function names() to return a list of formatted names
def names(doc):
    formatted_names = ["{firstname} {surname}".format(**laureate)
           # sort by surname, but return laureate field            
          for laureate in sorted(doc["laureates"], key=itemgetter('surname'))]
    return formatted_names

lines = ["{year}: {names}".format(year=doc["year"], names=" and ".join(names(doc)))
         for doc in cursor]
for line in lines: print(line)


1901: Röntgen
1902: Lorentz
1903: Becquerel
1904: (John William Strutt)
1905: von Lenard
1906: Thomson
1907: Michelson
1908: Lippmann
1909: Marconi
1910: van der Waals
1911: Wien
1912: Dalén
1913: Kamerlingh Onnes
1914: von Laue
1915: Bragg
1917: Barkla
1918: Planck
1919: Stark
1920: Guillaume
1921: Einstein
1922: Bohr
1923: Millikan
1924: Siegbahn
1925: Franck
1926: Perrin
1927: Compton
1928: Richardson
1929: de Broglie
1930: Raman
1932: Heisenberg
1933: Schrödinger
1935: Chadwick
1936: Hess
1937: Davisson
1938: Fermi
1939: Lawrence
1943: Stern
1944: Rabi
1945: Pauli
1946: Bridgman
1947: Appleton
1948: Blackett
1949: Yukawa
1950: Powell
1951: Cockcroft
1952: Bloch
1953: Zernike
1954: Born
1955: Lamb
1956: Shockley
1957: Yang
1958: Cherenkov
1959: Segrè
1960: Glaser
1961: Hofstadter
1962: Landau
1963: Wigner
1964: Townes
1965: Tomonaga
1966: Kastler
1967: Bethe
1968: Alvarez
1969: Gell-Mann
1970: Alfvén
1971: Gabor
1972: Bardeen
1973: Esaki
1974: Ryle
1975: Bohr
1976: Richter
1977: And

1. Sorting first by reverse chronological order and second by alphabetical order of category, collect and format prize documents to produce one formatted entry per year listing categories missing for that year.

2. Use <collection>.find to construct a cursor cursor that yields prize documents only for categories in the list of original categories, sorted first by decreasing year and second by increasing category.
    
 3. Collect a list not_awarded of entries to be printed, one per line, that display a year and the categories missing for that year. You will collect "category" values for each year and set-subtract them from the original categories.

In [67]:
import itertools
from operator import itemgetter

# Save the set of prize categories awarded in 1901
original_categories = set(db.prizes.distinct("category", {"year": "1901"}))

# Construct a cursor over original-category prizes
cursor = db.prizes.find({"category": {"$in": list(original_categories)}}, ["category", "year"],
                        sort=[("year", -1), ("category", 1)])

# Collect entries for missing prize categories
not_awarded = []
for key, group in itertools.groupby(cursor, key=itemgetter("year")):
    year_categories = set(prize['category'] for prize in group)
    missing = ", ".join(sorted(original_categories - year_categories))
    if missing: not_awarded.append("{}: {}".format(key, missing))

for line in not_awarded: print(line)

2018: literature
1972: peace
1967: peace
1966: peace
1956: peace
1955: peace
1948: peace
1943: literature, peace
1939: peace
1935: literature
1934: physics
1933: chemistry
1932: peace
1931: physics
1928: peace
1925: medicine
1924: chemistry, peace
1923: peace
1921: medicine
1919: chemistry
1918: literature, medicine, peace
1917: chemistry, medicine
1916: chemistry, medicine, peace, physics
1915: medicine, peace
1914: literature, peace


# Indexes in MongoDB

Help speed the look up. Index field using the value of those fields. Analogy with indexes of a book. Each collection is a book. Each document as a page, and each field is a type of content.

Indexes is useful when:
- expected a few documents back
- large collections 

Index covers a query projection
`information` ask if an index exist 
This command find all existing indexes
db.laureates.index_information() 

`explain()` details how a given query will execute
db.laureates.find(
    {"firstname": "Marie"}, {"bornCountry": 1, "_id": 0}).explain()

For each prize category, report the most recent year that a single laureate -- rather than several -- received a prize in that category.

In [68]:
# Specify an index model for compound sorting
index_model = [("category", 1), ("year", -1)]
db.prizes.create_index(index_model)

# Collect the last single-laureate year for each category
report = ""
for category in sorted(db.prizes.distinct("category")):
    doc = db.prizes.find_one(
        {"category": category, "laureates.share": "1"},
        sort=[("year", -1)]
    )
    report += "{category}: {year}\n".format(**doc)

print(report)

chemistry: 2011
economics: 2017
literature: 2017
medicine: 2016
peace: 2017
physics: 1992



Find the five countries of birth with the highest counts of  laureates with both their country of birth ("bornCountry") and a country of affiliation the same.

In [69]:
from collections import Counter

# Ensure an index on country of birth
db.laureates.create_index([("bornCountry", 1)])

# Collect a count of laureates for each country of birth
n_born_and_affiliated = {
    country: db.laureates.count_documents({
        'bornCountry': country,
        "prizes.affiliations.country": country
    })
    for country in db.laureates.distinct("bornCountry")
}

five_most_common = Counter(n_born_and_affiliated).most_common(5)
print(five_most_common)

[('USA', 241), ('United Kingdom', 56), ('France', 26), ('Germany', 19), ('Japan', 17)]


# Limits and Skips with Sorts

help search documents with extreme values

## Limit the search

In [71]:
#verify that all prizes have a 1/3 shares  or none at all
for doc in db.prizes.find({}, ["laureates.share"]):
    share_is_three = [laureate["share"] == "3"
                      for laureate in doc["laureates"]]
    assert all(share_is_three) or not any(share_is_three)

In [77]:
# print info on prizes split three ways and limit the output to 3
for doc in db.prizes.find({"laureates.share": "3"}, limit=10):
    print("{year} {category}".format(**doc))

2017 chemistry
2017 medicine
2016 chemistry
2015 chemistry
2014 physics
2014 chemistry
2013 chemistry
2013 medicine
2013 economics
2011 peace


In [78]:
# print info on prizes split three ways 
# limit the output to 3 and skip the first 3 record
for doc in db.prizes.find({"laureates.share": "3"}, skip=3,limit=3):
    print("{year} {category}".format(**doc))

2015 chemistry
2014 physics
2014 chemistry


In [76]:
#another way to limit is to chain the method
for doc in db.prizes.find({"laureates.share": "3"}).limit(3).skip(3):
    print("{year} {category}".format(**doc))

2015 chemistry
2014 physics
2014 chemistry


Limit with sort

In [80]:
cursor2 = (db.prizes.find({"laureates.share": "3"}).skip(3).limit(3)
          .sort("year", 1))

docs = list(cursor2)
for doc in docs:
    print("{year} {category}".format(**doc))

1954 medicine
1956 physics
1956 medicine


In [81]:
from pprint import pprint

# Fetch prizes with quarter-share laureate(s)
filter_ = {"laureates.share": "4"}

# Save the list of field names
projection = ['category', 'year', "laureates.motivation"]

# Save a cursor to yield the first five prizes
cursor = db.prizes.find(filter_, projection).sort("year").limit(5)
pprint(list(cursor))

[{'_id': ObjectId('5c9fd8a604ac58346c84e93d'),
  'category': 'physics',
  'laureates': [{'motivation': '"in recognition of the extraordinary services '
                               'he has rendered by his discovery of '
                               'spontaneous radioactivity"'},
                {'motivation': '"in recognition of the extraordinary services '
                               'they have rendered by their joint researches '
                               'on the radiation phenomena discovered by '
                               'Professor Henri Becquerel"'},
                {'motivation': '"in recognition of the extraordinary services '
                               'they have rendered by their joint researches '
                               'on the radiation phenomena discovered by '
                               'Professor Henri Becquerel"'}],
  'year': '1903'},
 {'_id': ObjectId('5c9fd8a604ac58346c84e892'),
  'category': 'chemistry',
  'laureates': [{'motivation':

Want to present these laureates one page at a time, with three laureates per page

In [82]:
from pprint import pprint

# Write a function to retrieve a page of data
def get_particle_laureates(page_number=1, page_size=3):
    if page_number < 1 or not isinstance(page_number, int):
        raise ValueError("Pages are natural numbers (starting from 1).")
    particle_laureates = list(
        db.laureates.find(
            {"prizes.motivation": {'$regex': "particle"}},
            ["firstname", "surname", "prizes"])
        .sort([('prizes.year', 1), ("surname", 1)])
        .skip(page_size * (page_number - 1))
        .limit(page_size))
    return particle_laureates

# Collect and save the first nine pages
pages = [get_particle_laureates(page_number=page) for page in range(1,9)]
pprint(pages[0])

[{'_id': ObjectId('5c9fd8a804ac58346c84e96c'),
  'firstname': 'Charles Thomson Rees',
  'prizes': [{'affiliations': [{'city': 'Cambridge',
                                'country': 'United Kingdom',
                                'name': 'University of Cambridge'}],
              'category': 'physics',
              'motivation': '"for his method of making the paths of '
                            'electrically charged particles visible by '
                            'condensation of vapour"',
              'share': '2',
              'year': '1927'}],
  'surname': 'Wilson'},
 {'_id': ObjectId('5c9fd8a804ac58346c84e982'),
  'firstname': 'Sir John Douglas',
  'prizes': [{'affiliations': [{'city': 'Harwell, Berkshire',
                                'country': 'United Kingdom',
                                'name': 'Atomic Energy Research '
                                        'Establishment'}],
              'category': 'physics',
              'motivation': '"for their pione

# Aggregation on the server

aggegration pipeline looks like this 

cursor = db.laureates.aggregate([
  stage_1,
  stage_2,
  ...
])

In [83]:
cursor = db.laureates.find(
  filter={"bornCountry": "USA"},
  projection={"prizes.year": 1},
  limit=3
)

for doc in cursor:
  print(doc["prizes"])

[{'year': '1923'}]
[{'year': '1927'}]
[{'year': '1936'}]


In [84]:
cursor = db.laureates.aggregate([
  {"$match": {"bornCountry": "USA"}},
  {"$project": {"prizes.year": 1}},
  {"$limit": 3}
])
for doc in cursor:
  print(doc["prizes"])

[{'year': '1923'}]
[{'year': '1927'}]
[{'year': '1936'}]


In [86]:
from collections import OrderedDict

list(db.laureates.aggregate([
    {"$match": {"bornCountry": "USA"}},
    {"$project": {"prizes.year": 1, "_id": 0}},
    {"$sort": OrderedDict([("prizes.year", 1)])},
    {"$skip": 1},
    {"$limit": 3}
]))

[{'prizes': [{'year': '1912'}]},
 {'prizes': [{'year': '1914'}]},
 {'prizes': [{'year': '1919'}]}]

to count

In [87]:
list(db.laureates.aggregate([
    {"$match": {"bornCountry": "USA"}},
    {"$count": "n_USA-born-laureates"}
]))

[{'n_USA-born-laureates': 269}]

Find an equivalent aggregation pipeline with the below cursor command

In [89]:
cursor = (db.laureates.find(
    {"gender": {"$ne": "org"}},
    ["bornCountry", "prizes.affiliations.country"]
).limit(3))

for doc in cursor:
    print("{bornCountry}: {prizes}".format(**doc))

Prussia (now Germany): [{'affiliations': [{'country': 'Germany'}]}]
the Netherlands: [{'affiliations': [{'country': 'the Netherlands'}]}]
the Netherlands: [{'affiliations': [{'country': 'the Netherlands'}]}]


In [90]:
# the equivalent pipeline is 
# Translate cursor to aggregation pipeline
pipeline = [
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$limit": 3}
]

for doc in db.laureates.aggregate(pipeline):
    print("{bornCountry}: {prizes}".format(**doc))


Prussia (now Germany): [{'affiliations': [{'country': 'Germany'}]}]
the Netherlands: [{'affiliations': [{'country': 'the Netherlands'}]}]
the Netherlands: [{'affiliations': [{'country': 'the Netherlands'}]}]


Construct an aggregation pipeline to collect, in reverse chronological order (i.e., descending year), prize documents for all original categories (that is, $in categories awarded in 1901). Project only the prize year and category (including document _id is fine).

The aggregation cursor will be fed to Python's itertools.groupby function to group prizes by year. For each year that at least one of the original prize categories was missing, a line with all missing categories for that year will be printed.

In [91]:
from collections import OrderedDict
from itertools import groupby
from operator import itemgetter

original_categories = set(db.prizes.distinct("category", {"year": "1901"}))

# Save an pipeline to collect original-category prizes
pipeline = [
    {"$match": {'category': {"$in": sorted(original_categories)}}},
    {"$project": {"year": 1, "category": 1}},
    {"$sort": OrderedDict([("year", -1)])}
]
cursor = db.prizes.aggregate(pipeline)
for key, group in groupby(cursor, key=itemgetter("year")):
    missing = original_categories - {doc["category"] for doc in group}
    if missing:
        print("{year}: {missing}".format(year=key, missing=", ".join(sorted(missing))))

2018: literature
1972: peace
1967: peace
1966: peace
1956: peace
1955: peace
1948: peace
1943: literature, peace
1939: peace
1935: literature
1934: physics
1933: chemistry
1932: peace
1931: physics
1928: peace
1925: medicine
1924: chemistry, peace
1923: peace
1921: medicine
1919: chemistry
1918: literature, medicine, peace
1917: chemistry, medicine
1916: chemistry, medicine, peace, physics
1915: medicine, peace
1914: literature, peace


Field paths 
* expression object = {field1:<expression>,...}

in the example below, an expression object to a project stage. The object has one key, "prizes.share" with expression value 1  

In [92]:
db.laureates.aggregate([
    {"$project": {"prizes.share": 1}}
]).next()

{'_id': ObjectId('5c9fd8a804ac58346c84e94c'), 'prizes': [{'share': '1'}]}

Here we project the field called n_prizes. The field takes the value of the expression `"$size"` maps to $prize
* `$prizes` is a field path. It takes the value of the prizes field for each document processed at that stage by the pipeline.
*  `"$size": "$prizes"` is an operator expression. Treats an operator as a function. The expression applies the size operator to one ore more arguments and returns a value. Here it takes `$prizes` as an argument, and assign the size to `$n_prizes`.

The operator expression can take a list thus `["$prizes"]` will give the same result.

Field in MongoDB is preceed by $ sign 


In [93]:
db.laureates.aggregate([
    {"$project": {"n_prizes": {"$size": "$prizes"}}}
]).next()

{'_id': ObjectId('5c9fd8a804ac58346c84e94c'), 'n_prizes': 1}

Translate distint() collection method to aggegation

In [97]:
list_1 = db.laureates.distinct("bornCountry")
# each output document will have _id as a distinct value of bornCountry 
list_2 = [doc["_id"] for doc in db.laureates.aggregate([
    {"$group": {"_id": "$bornCountry"}}
])]

set(list_2) == set(list_1)

True

In [98]:
set(list_2)

{'Argentina',
 'Australia',
 'Austria',
 'Austria-Hungary (now Austria)',
 'Austria-Hungary (now Bosnia and Herzegovina)',
 'Austria-Hungary (now Croatia)',
 'Austria-Hungary (now Czech Republic)',
 'Austria-Hungary (now Hungary)',
 'Austria-Hungary (now Poland)',
 'Austria-Hungary (now Slovenia)',
 'Austria-Hungary (now Ukraine)',
 'Austrian Empire (now Austria)',
 'Austrian Empire (now Czech Republic)',
 'Austrian Empire (now Italy)',
 'Bavaria (now Germany)',
 'Belgium',
 'Bosnia (now Bosnia and Herzegovina)',
 'Brazil',
 'British India (now Bangladesh)',
 'British India (now India)',
 'British Mandate of Palestine (now Israel)',
 'British Protectorate of Palestine (now Israel)',
 'British West Indies (now Saint Lucia)',
 'Bulgaria',
 'Burma (now Myanmar)',
 'Canada',
 'Chile',
 'China',
 'Colombia',
 'Costa Rica',
 'Crete (now Greece)',
 'Cyprus',
 'Czechoslovakia (now Czech Republic)',
 'Denmark',
 'East Friesland (now Germany)',
 'East Timor',
 'Egypt',
 'Faroe Islands (Denmark)'

How many prizes have been awarded in total? 

In [99]:
list(db.laureates.aggregate([
    {"$project": {"n_prizes": {"$size": "$prizes"}}},
    {"$group": {"_id": None, "n_prizes_total": {"$sum": "$n_prizes"}}}
]))

[{'_id': None, 'n_prizes_total': 941}]

How many prizes were awarded (at least partly) to organizations?

In [100]:
# Count prizes awarded (at least partly) to organizations as a sum over sizes of "prizes" arrays.
pipeline = [
    {"$match": {"gender": "org"}},
    {"$project": {"n_prizes": {"$size": "$prizes"}}},
    {"$group": {"_id": None, "n_prizes_total": {"$sum": "$n_prizes"}}}
]

print(list(db.laureates.aggregate(pipeline)))

[{'_id': None, 'n_prizes_total': 27}]


Implement an aggregation pipeline that:

Filters for original prize categories (i.e. sans economics),
Projects category and year,
Groups distinct prize categories awarded by year,
Projects prize categories not awarded by year,
Filters for years with missing prize categories, and
Returns a cursor of documents in reverse chronological order, one per year, each with a list of missing prize categories for that year.

In [101]:
from collections import OrderedDict

original_categories = sorted(set(db.prizes.distinct("category", {"year": "1901"})))
pipeline = [
    {"$match": {"category": {"$in": original_categories}}},
    {"$project": {"category": 1, "year": 1}},
    
    # Collect the set of category values for each prize year.
    {"$group": {"_id": "$year", "categories": {"$addToSet": "$category"}}},
    
    # Project categories *not* awarded (i.e., that are missing this year).
    {"$project": {"missing": {"$setDifference": [original_categories, "$categories"]}}},
    
    # Only include years with at least one missing category
    {"$match": {"missing.0": {"$exists": True}}},
    
    # Sort in reverse chronological order. Note that "_id" is a distinct year at this stage.
    {"$sort": OrderedDict([("_id", -1)])},
]
for doc in db.prizes.aggregate(pipeline):
    print("{year}: {missing}".format(year=doc["_id"],missing=", ".join(sorted(doc["missing"]))))

2018: literature
1972: peace
1967: peace
1966: peace
1956: peace
1955: peace
1948: peace
1943: literature, peace
1939: peace
1935: literature
1934: physics
1933: chemistry
1932: peace
1931: physics
1928: peace
1925: medicine
1924: chemistry, peace
1923: peace
1921: medicine
1919: chemistry
1918: literature, medicine, peace
1917: chemistry, medicine
1916: chemistry, medicine, peace, physics
1915: medicine, peace
1914: literature, peace


Using $unwind to zoom into array fields

Want the number of laureates for each prize.

In [102]:
list(db.prizes.aggregate([
    # project a field using $size 
    {"$project": {"n_laureates": {"$size": "$laureates"},
                  "category": 1}},
    # group by category producing count of laureate per category 
    # producing a count of laureates per catergory
    # reset the field n_laureates to be the sume of n_laureates per category
    {"$group": {"_id": "$category", "n_laureates": {"$sum": "$n_laureates"}}},
    # sort by descending count
    {"$sort": {"n_laureates": -1}},
]))

[{'_id': 'medicine', 'n_laureates': 216},
 {'_id': 'physics', 'n_laureates': 210},
 {'_id': 'chemistry', 'n_laureates': 181},
 {'_id': 'peace', 'n_laureates': 133},
 {'_id': 'literature', 'n_laureates': 114},
 {'_id': 'economics', 'n_laureates': 81}]

$unwind output one document per array document then recompress the stage

These two pipeline produce the same result

In [103]:
list(db.prizes.aggregate([
    {"$unwind": "$laureates"},
    {"$project": {
        "_id": 0, "year": 1, "category": 1,
        "laureates.surname": 1, "laureates.share": 1}},
    {"$limit": 3}
]))

[{'year': '2018',
  'category': 'physics',
  'laureates': {'surname': 'Ashkin', 'share': '2'}},
 {'year': '2018',
  'category': 'physics',
  'laureates': {'surname': 'Mourou', 'share': '4'}},
 {'year': '2018',
  'category': 'physics',
  'laureates': {'surname': 'Strickland', 'share': '4'}}]

Want to track only the laureates id for each prize 

In [104]:
list(db.prizes.aggregate([
    {"$unwind": "$laureates"},
    {"$project": {"year": 1, "category": 1, "laureates.id": 1}},
    # year and category together identify a prize, so group by
    # a concat of those value
    {"$group": {"_id": {"$concat": ["$category", ":", "$year"]},
                "laureate_ids": {"$addToSet": "$laureates.id"}}},
    {"$limit": 5}
]))

[{'_id': 'literature:1901', 'laureate_ids': ['569']},
 {'_id': 'medicine:1901', 'laureate_ids': ['293']},
 {'_id': 'chemistry:1901', 'laureate_ids': ['160']},
 {'_id': 'physics:1901', 'laureate_ids': ['1']},
 {'_id': 'peace:1902', 'laureate_ids': ['465', '464']}]

`$lookup` pull from another collection via left outer join. THis command is often used with `$unwind`.

The code below collect country of birth for economics laureates.

In [105]:
list(db.prizes.aggregate([
    {"$match": {"category": "economics"}},
    # unwind the laureates array each pipeline document now has a single laureates.id
    {"$unwind": "$laureates"},
    # query the lureat collection for documents with the same id value
    {"$lookup": {"from": "laureates", "foreignField": "id",
                # put those documents into laureate_bios
                 "localField": "laureates.id", "as": "laureate_bios"}},
     
    {"$unwind": "$laureate_bios"},
    # collect distinct laureat bornCountry values
    {"$group": {"_id": None,
                "bornCountries": {"$addToSet": "$laureate_bios.bornCountry"}
    }},
]))

[{'_id': None,
  'bornCountries': ['Russian Empire (now Belarus)',
   'the Netherlands',
   'Russian Empire (now Russia)',
   'Sweden',
   'Germany (now Poland)',
   'India',
   'Norway',
   'British West Indies (now Saint Lucia)',
   'Italy',
   'Germany',
   'Russia',
   'Cyprus',
   'Canada',
   'Austria',
   'Scotland',
   'British Mandate of Palestine (now Israel)',
   'Finland',
   'Hungary',
   'France',
   'USA',
   'United Kingdom']}]

In [107]:
#alternate code

bornCountries = db.laureates.distinct(
    "bornCountry", {"prizes.category": "economics"})
set(bornCountries)

{'Austria',
 'British Mandate of Palestine (now Israel)',
 'British West Indies (now Saint Lucia)',
 'Canada',
 'Cyprus',
 'Finland',
 'France',
 'Germany',
 'Germany (now Poland)',
 'Hungary',
 'India',
 'Italy',
 'Norway',
 'Russia',
 'Russian Empire (now Belarus)',
 'Russian Empire (now Russia)',
 'Scotland',
 'Sweden',
 'USA',
 'United Kingdom',
 'the Netherlands'}

Build an aggregation pipeline to get the count of laureates who either did or did not win a prize with an affiliation country that is a substring of their country of birth -- for example, the prize affiliation country "Germany" should match the country of birth "Prussia (now Germany)".

In [108]:
key_ac = "prizes.affiliations.country"
key_bc = "bornCountry"
pipeline = [
    {"$project": {key_bc: 1, key_ac: 1}},

    # Ensure a single prize affiliation country per pipeline document
    {"$unwind": "$prizes"},
    {"$unwind": "$prizes.affiliations"},

    # Ensure values in the list of distinct values (so not empty)
    {"$match": {key_ac: {"$in": db.laureates.distinct(key_ac)}}},
    {"$project": {"affilCountrySameAsBorn": {
        "$gte": [{"$indexOfBytes": ["$"+key_ac, "$"+key_bc]}, 0]}}},

    # Count by "$affilCountrySameAsBorn" value (True or False)
    {"$group": {"_id": "$affilCountrySameAsBorn",
                "count": {"$sum": 1}}},
]
for doc in db.laureates.aggregate(pipeline): print(doc)

{'_id': True, 'count': 477}
{'_id': False, 'count': 261}


build an aggregation pipeline for the prizes collection to collect these numbers, using a $lookup stage to obtain laureate countries of birth.

In [109]:
pipeline = [
    # Unwind the laureates array
    {"$unwind": "$laureates"},
    {"$lookup": {
        "from": "laureates", "foreignField": "id",
        "localField": "laureates.id", "as": "laureate_bios"}},

    # Unwind the new laureate_bios array
    {"$unwind": "$laureate_bios"},
    {"$project": {"category": 1,
                  "bornCountry": "$laureate_bios.bornCountry"}},

    # Collect bornCountry values associated with each prize category
    {"$group": {"_id": "$category",
                "bornCountries": {"$addToSet": "$bornCountry"}}},

    # Project out the size of each category's (set of) bornCountries
    {"$project": {"category": 1,
                  "nBornCountries": {"$size": "$bornCountries"}}},
    {"$sort": {"nBornCountries": -1}},
]
for doc in db.prizes.aggregate(pipeline): print(doc)

{'_id': 'literature', 'nBornCountries': 55}
{'_id': 'peace', 'nBornCountries': 50}
{'_id': 'chemistry', 'nBornCountries': 48}
{'_id': 'medicine', 'nBornCountries': 44}
{'_id': 'physics', 'nBornCountries': 44}
{'_id': 'economics', 'nBornCountries': 21}


`$addFields` commands 

Want to know the number of year a laureates was alive. One way to do it is to project the died and born field, but this will not work because some laureates does not have the died and born fields. Because some laureates are still alive.

To fix this use add field stage to provide new array fields.

In [None]:
# this code does not work 
docs = list(db.laureates.aggregate([
    {"$project": {"died": {"$dateFromString": {"dateString": "$died"}},
                  "born": {"$dateFromString": {"dateString": "$born"}}}}
]))

In [111]:
docs = list(db.laureates.aggregate([
    {"$match": {"died": {"$gt": "1700"}, "born": {"$gt": "1700"}}},
    {"$addFields": {"bornArray": {"$split": ["$born", "-"]},
                    "diedArray": {"$split": ["$died", "-"]}}},
    # if the value is missing add the value 
     {"$addFields": {"born": {"$cond": [
        {"$in": ["00", "$bornArray"]},
        {"$concat": [{"$arrayElemAt": ["$bornArray", 0]}, "-01-01"]},
        "$born"
    ]}}},
    
  {"$project": {"died": {"$dateFromString": {"dateString": "$died"}},
                  "born": {"$dateFromString": {"dateString": "$born"}},
                  "_id": 0}}
]))
print(docs[0])
   

{'died': datetime.datetime(1923, 2, 10, 0, 0), 'born': datetime.datetime(1845, 3, 27, 0, 0)}


Compute the number of year between the died and born dates. `$bucket` groups values into buckets defined by a sequence of boundaries.

In [113]:
docs = list(db.laureates.aggregate([
        {"$match": {"died": {"$gt": "1700"}, "born": {"$gt": "1700"}}},
    {"$addFields": {"bornArray": {"$split": ["$born", "-"]},
                    "diedArray": {"$split": ["$died", "-"]}}},
    # if the value is missing add the value 
     {"$addFields": {"born": {"$cond": [
        {"$in": ["00", "$bornArray"]},
        {"$concat": [{"$arrayElemAt": ["$bornArray", 0]}, "-01-01"]},
        "$born"
    ]}}},
    
    {"$project": {"died": {"$dateFromString": {"dateString": "$died"}},
                  "born": {"$dateFromString": {"dateString": "$born"}}}},
    {"$project": {"years": {"$floor": {"$divide": [
        {"$subtract": ["$died", "$born"]},
        31557600000 # 1000 * 60 * 60 * 24 * 365.25
    ]}}}},
    
     {"$bucket": {"groupBy": "$years",
                 "boundaries": list(range(30, 120, 10))}}
]))

for doc in docs: print(doc)

{'_id': 30, 'count': 1}
{'_id': 40, 'count': 6}
{'_id': 50, 'count': 21}
{'_id': 60, 'count': 87}
{'_id': 70, 'count': 154}
{'_id': 80, 'count': 221}
{'_id': 90, 'count': 115}
{'_id': 100, 'count': 2}


How many prizes were awarded to people who had no affiliation in their country of birth at the time of the award?

In [114]:
pipeline = [
    # Limit results to people; project needed fields; unwind prizes
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$unwind": "$prizes"},
  
    # Count prizes with no country-of-birth affiliation
    {"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
    {"$match": {"bornCountryInAffiliations": False}},
    {"$count": "awardedElsewhere"},
]

print(list(db.laureates.aggregate(pipeline)))

[{'awardedElsewhere': 478}]


filter out "unaffiliated" people 

hundreds of prizes were awarded to people without recorded affiliations; sure, their "bornCountry" is technically not the "country" of any of their affiliations, but there are no "country" values to compare against.

This is achieve by addition more filter 

In [116]:
pipeline = [
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$unwind": "$prizes"},
    {"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
    {"$match": {"bornCountryInAffiliations": False}},
    {"$count": "awardedElsewhere"},
]

# Construct the additional filter stage
added_stage = {"$match": {'prizes.affiliations.country': {"$in": db.laureates.distinct('prizes.affiliations.country')}}}

# Insert this stage into the pipeline
pipeline.insert(3, added_stage)
print(list(db.laureates.aggregate(pipeline)))

[{'awardedElsewhere': 252}]
