In [7]:
#!pip install musicbrainzngs
#!pip3 install pymongo

In [8]:
import os
import time
import pprint
import datetime
from pymongo import MongoClient, IndexModel, ASCENDING, DESCENDING
import musicbrainzngs
from musicbrainzngs import *

In [9]:
# Identify this application to the MusicBrainz web service.
# musicbrainzngs talks to version 2 of the web service:
# https://musicbrainz.org/doc/MusicBrainz_API

APP_NAME = "coffeecode"
APP_VERSION = 1
musicbrainzngs.set_useragent(APP_NAME, APP_VERSION, contact=None)

In [10]:
# Open a client for the MongoDB server on this machine

MONGO_HOST = 'localhost'
MONGO_PORT = 27017
client = MongoClient(host=MONGO_HOST, port=MONGO_PORT)

In [11]:
# Select the jazz_catalog database used by the rest of the notebook

db = client['jazz_catalog']

In [12]:
# Start from two empty collections: artists and releases
# Note that due to network limitations we will use only a subset of available Musicbrainz metadata

# Drop leftovers from a previous run so the notebook can be re-run from the top
for stale_collection in ('artists', 'releases'):
    db.drop_collection(stale_collection)

artists = db['artists']
releases = db['releases']

In [13]:
# Search MusicBrainz for artists matching the seed name

seedArtist = musicbrainzngs.search_artists(query="miles davis")

In [14]:
# List every search hit as "name, MusicBrainz id"
for hit in seedArtist['artist-list']:
    print("{name}, {id}".format_map(hit))

Miles Davis, 561d854a-6a28-4aa7-8c99-323e6ce46c2a
Miles Davis Quintet, fe7245e7-d734-4ca1-8e26-691883f58201
The Miles Davis Nonet, 16d2b8e6-8930-4c9e-856f-e6d096479350
Miles Davis Sextet, f137837b-fa55-48d1-8c85-f13961b3ac77
Miles Davis Quartet, 03606dee-b333-48fe-b9af-d671c837d40f
Miles Davis Septet, 88130878-7ee9-4ce2-b24b-1dd9da898030
Miles Davis Moody, d74d6350-a042-4202-b4fd-2a05918a06be
Miles Davis All Stars, 607d7275-2b92-49e5-a7ee-32f9cc29b076
Miles Davis + 19, 72f2e1fe-dc92-4011-93d7-2fb6aff4a2bd
The Miles Davis / Tadd Dameron Quintet, 7103c200-fd5c-4890-8685-5328a055f7ba
Miles!, f2b73b53-3414-4883-b014-622ab6917215
Miles Davis and His Orchestra, c52ff92e-b338-4815-bd63-4e9b89795e84
Miles Davis All Star Sextet, 47c342e2-5c5f-4b5a-b919-8e0ffc329272
The Shoes Of Miles Davis, addd7107-3955-4461-a62c-0b346feb5450
Miles Davis & The Modern Jazz Giants, e1b6e38e-ab75-43fc-b994-bfeea9bc7626
Miles Davis and the Lighthouse All-Stars, 4cd1a02d-9645-4172-8d45-e468e097c23b
Conrad Roberts, 

In [15]:
# Index both collections on the MusicBrainz 'id' field;
# unique=True enforces a uniqueness constraint on that field

unique_id_key = [('id', ASCENDING)]
db['artists'].create_index(unique_id_key, name='artist_id', unique=True)
db['releases'].create_index(unique_id_key, name='release_id', unique=True)

'release_id'

In [16]:
# Fetch and display the full MusicBrainz record for the top two search hits
# (this cell only prints; nothing is written to MongoDB here)

top_hits = seedArtist['artist-list'][:2]
for artist in top_hits:
    artist_info = musicbrainzngs.get_artist_by_id(artist['id'])
    pprint.pprint(artist_info)

{'artist': {'area': {'id': '489ce91b-6658-3307-9877-795b68554c98',
                     'iso-3166-1-code-list': ['US'],
                     'name': 'United States',
                     'sort-name': 'United States'},
            'begin-area': {'id': '39e1719e-2604-4b59-b698-dab7caf85b33',
                           'name': 'Alton',
                           'sort-name': 'Alton'},
            'country': 'US',
            'disambiguation': 'jazz trumpeter, bandleader, songwriter',
            'end-area': {'id': 'dbacf2e3-7e3e-4cee-8804-999b109285fa',
                         'name': 'Santa Monica',
                         'sort-name': 'Santa Monica'},
            'gender': 'Male',
            'id': '561d854a-6a28-4aa7-8c99-323e6ce46c2a',
            'ipi': '00007619785',
            'ipi-list': ['00007619785'],
            'isni-list': ['000000012144707X'],
            'life-span': {'begin': '1926-05-26',
                          'end': '1991-09-28',
                          'ended'

In [17]:
# Store the MusicBrainz record of each of the top two search hits
for mbid in (hit['id'] for hit in seedArtist['artist-list'][:2]):
    artist_info = musicbrainzngs.get_artist_by_id(mbid)
    artists.insert_one(artist_info['artist'])

In [18]:
# Insert the same two artists a second time to exercise the uniqueness
# constraint: any error raised by the insert is printed instead of propagated

for mbid in (hit['id'] for hit in seedArtist['artist-list'][:2]):
    artist_info = musicbrainzngs.get_artist_by_id(mbid)
    try:
        artists.insert_one(artist_info['artist'])
    except Exception as insert_error:
        print(insert_error)

E11000 duplicate key error collection: jazz_catalog.artists index: artist_id dup key: { : "561d854a-6a28-4aa7-8c99-323e6ce46c2a" }
E11000 duplicate key error collection: jazz_catalog.artists index: artist_id dup key: { : "fe7245e7-d734-4ca1-8e26-691883f58201" }


In [19]:
# Number of artist documents stored so far.
# Collection.count() is deprecated in PyMongo; count_documents({}) is the
# supported way to count every document in a collection.
db.artists.count_documents({})

  """Entry point for launching an IPython kernel.


2

In [20]:
# Pretty-print every document currently in the artists collection
for artist_doc in artists.find():
    pprint.pprint(artist_doc)

{'_id': ObjectId('5c086ce6c9efcc1303986738'),
 'area': {'id': '489ce91b-6658-3307-9877-795b68554c98',
          'iso-3166-1-code-list': ['US'],
          'name': 'United States',
          'sort-name': 'United States'},
 'begin-area': {'id': '39e1719e-2604-4b59-b698-dab7caf85b33',
                'name': 'Alton',
                'sort-name': 'Alton'},
 'country': 'US',
 'disambiguation': 'jazz trumpeter, bandleader, songwriter',
 'end-area': {'id': 'dbacf2e3-7e3e-4cee-8804-999b109285fa',
              'name': 'Santa Monica',
              'sort-name': 'Santa Monica'},
 'gender': 'Male',
 'id': '561d854a-6a28-4aa7-8c99-323e6ce46c2a',
 'ipi': '00007619785',
 'ipi-list': ['00007619785'],
 'isni-list': ['000000012144707X'],
 'life-span': {'begin': '1926-05-26', 'end': '1991-09-28', 'ended': 'true'},
 'name': 'Miles Davis',
 'sort-name': 'Davis, Miles',
 'type': 'Person'}
{'_id': ObjectId('5c086ce7c9efcc1303986739'),
 'area': {'id': '489ce91b-6658-3307-9877-795b68554c98',
          'iso-316

In [21]:
# Get the first set of releases with 'Official' status for every stored artist
# (keep result sets to default 25 to minimize network impact).
# A second call per release fetches the complete track info, and that
# response is inserted into the mongodb releases collection.

ARTIST_INCLUDES = ['releases', 'release-rels', 'label-rels']
RELEASE_INCLUDES = ['artist-credits', 'recordings',
                    'recording-level-rels', 'artist-rels']

for a in artists.find():
    artist_id = a['id']
    release_search = musicbrainzngs.get_artist_by_id(artist_id, includes=ARTIST_INCLUDES)
    for r in release_search['artist']['release-list']:
        # Skip releases that carry no status or a status other than "Official"
        if r.get('status') != "Official":
            continue
        release_info = musicbrainzngs.get_release_by_id(r['id'], includes=RELEASE_INCLUDES)
        try:
            releases.insert_one(release_info['release'])
        except Exception as insert_error:
            # Report the failed insert and carry on with the next release
            print(insert_error)

In [22]:
# Number of release documents stored so far.
# Collection.count() is deprecated in PyMongo; count_documents({}) is the
# supported way to count every document in a collection.
db.releases.count_documents({})

  """Entry point for launching an IPython kernel.


45

In [23]:
# so now get all artists info for all tracks on all releases:
# walk release -> medium -> track -> recording and collect the id of every
# artist listed in a recording's 'artist-relation-list'

artist_ids = set()
artist_count = 0

for r in releases.find():
    for m in r.get('medium-list', []):
        for each in m.get('track-list', []):
            # Tracks without a recording, or recordings without artist
            # relations, contribute nothing
            relations = each.get('recording', {}).get('artist-relation-list', [])
            for a in relations:
                artist_count += 1
                artist_ids.add(a['artist']['id'])

In [24]:
# Total artist relations encountered (an artist is counted once per relation)
artist_count

1455

In [25]:
# Number of distinct artist ids collected from those relations
len(artist_ids)

86

In [26]:
# Ids already stored in the artists collection ...
current_artist_ids = {stored_id for stored_id in artists.distinct('id')}
# ... and the collected ids that are not stored yet
new_artist_search_ids = artist_ids.difference(current_artist_ids)

In [27]:
# How many collected artist ids are not yet in the artists collection
len(new_artist_search_ids)

85

In [28]:
# Fetch the MusicBrainz record of every not-yet-stored artist and insert it;
# an insert that fails is printed and the loop moves on
for a_id in new_artist_search_ids:
    artist_info = musicbrainzngs.get_artist_by_id(a_id)
    try:
        artists.insert_one(artist_info['artist'])
    except Exception as insert_error:
        print(insert_error)

In [29]:
# Number of artist documents after adding the newly discovered artists.
# Collection.count() is deprecated in PyMongo; count_documents({}) is the
# supported way to count every document in a collection.
artists.count_documents({})

  """Entry point for launching an IPython kernel.


87

In [30]:
# now redo release search for all new artists - there may be some duplicates
# (a failed insert is printed and skipped)

# Get the first set of releases with 'Official' status
# (keep result sets to default 25 to minimize network impact)
# Make a second call for complete track info and
# Insert response into mongodb releases collection

ARTIST_INCLUDES = ['releases', 'release-rels', 'label-rels']
RELEASE_INCLUDES = ['artist-credits', 'recordings',
                    'recording-level-rels', 'artist-rels']

for a_id in new_artist_search_ids:
    release_search = musicbrainzngs.get_artist_by_id(a_id, includes=ARTIST_INCLUDES)
    for r in release_search['artist']['release-list']:
        # Skip releases that carry no status or a status other than "Official"
        if r.get('status') != "Official":
            continue
        release_info = musicbrainzngs.get_release_by_id(r['id'], includes=RELEASE_INCLUDES)
        try:
            releases.insert_one(release_info['release'])
        except Exception as insert_error:
            print(insert_error)

E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "09d238f8-22b0-4d52-aa73-958f393dfab9" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "9158c339-0fc5-46aa-958b-8db2bc0f5c79" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "2c6f3c86-0df2-4033-b709-4b4dbaacc120" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "38bdd071-9f00-41f9-9b90-ea4548f10225" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "1198ed14-429d-4c0e-b1af-152d54046343" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "ed3b8786-015c-4880-b32a-1acf1c639ed3" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "f330c2c9-ea2b-4a56-9b16-68124480f9e8" }
E11000 duplicate key error collection: jazz_catalog.releases index: r

E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "385cdb46-8645-425d-a37a-4c62553d0820" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "45ca3307-42e9-48b6-95d9-03ba1d26a3cf" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "70d4f763-3703-49eb-bc86-5a5b9185fafc" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "a3822098-f941-4672-bfe1-e4c916224548" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "b8c24108-8db1-3050-af03-875b920f01c6" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "3aef34c7-0773-49d6-ba4f-9c3269b81ef6" }
E11000 duplicate key error collection: jazz_catalog.releases index: release_id dup key: { : "4114b581-7f9f-45f1-b83f-45bab0ac3ebe" }
E11000 duplicate key error collection: jazz_catalog.releases index: r

In [31]:
# Number of release documents after crawling the new artists.
# Collection.count() is deprecated in PyMongo; count_documents({}) is the
# supported way to count every document in a collection.
db.releases.count_documents({})

  """Entry point for launching an IPython kernel.


889

In [32]:
# Take a single document from the releases collection as a sample
a_release = releases.find_one()

In [33]:
# Pretty-print the sampled release to inspect its nested structure
pprint.pprint(a_release)

{'_id': ObjectId('5c086cebc9efcc130398673c'),
 'artist-credit': [{'artist': {'disambiguation': 'jazz trumpeter, bandleader, '
                                                 'songwriter',
                               'id': '561d854a-6a28-4aa7-8c99-323e6ce46c2a',
                               'name': 'Miles Davis',
                               'sort-name': 'Davis, Miles'}}],
 'artist-credit-phrase': 'Miles Davis',
 'country': 'US',
 'cover-art-archive': {'artwork': 'true',
                       'back': 'false',
                       'count': '1',
                       'front': 'true'},
 'date': '1954',
 'id': '16ed7b47-e00c-499d-8866-65e660f4c78f',
 'medium-count': 1,
 'medium-list': [{'format': '12" Vinyl',
                  'position': '1',
                  'track-count': 6,
                  'track-list': [{'artist-credit': [{'artist': {'disambiguation': 'jazz '
                                                                                  'trumpeter, '
                 

In [34]:
# Second pass over the (now larger) releases collection:
# walk release -> medium -> track -> recording and collect the id of every
# artist listed in a recording's 'artist-relation-list'
artist_ids = set()
artist_count = 0

for r in releases.find():
    for m in r.get('medium-list', []):
        for each in m.get('track-list', []):
            # Tracks without a recording, or recordings without artist
            # relations, contribute nothing
            relations = each.get('recording', {}).get('artist-relation-list', [])
            for a in relations:
                artist_count += 1
                artist_ids.add(a['artist']['id'])

In [35]:
# Total artist relations encountered (an artist is counted once per relation)
artist_count

34314

In [36]:
# Number of distinct artist ids collected from those relations
len(artist_ids)

1503

In [37]:
# Ids already stored in the artists collection ...
current_artist_ids = {stored_id for stored_id in artists.distinct('id')}
# ... and the collected ids that are not stored yet
new_artist_search_ids = artist_ids.difference(current_artist_ids)

In [38]:
# How many collected artist ids are not yet in the artists collection
len(new_artist_search_ids)

1417

In [39]:
# Fetch the MusicBrainz record of every not-yet-stored artist and insert it;
# an insert that fails is printed and the loop moves on
for a_id in new_artist_search_ids:
    artist_info = musicbrainzngs.get_artist_by_id(a_id)
    try:
        artists.insert_one(artist_info['artist'])
    except Exception as insert_error:
        print(insert_error)

In [40]:
# Final number of artist documents.
# Collection.count() is deprecated in PyMongo; count_documents({}) is the
# supported way to count every document in a collection.
artists.count_documents({})

  """Entry point for launching an IPython kernel.


1504

In [41]:
# Final number of release documents.
# Collection.count() is deprecated in PyMongo; count_documents({}) is the
# supported way to count every document in a collection.
releases.count_documents({})

  """Entry point for launching an IPython kernel.


889

## Aggregations

In [42]:
# https://info-mongodb-com.s3.amazonaws.com/ReferenceCards15-PDF.pdf

# pseudo-SQL equivalent of the aggregation below
'''
SELECT type, count(type)
FROM artists
GROUP BY type
'''

# One output document per distinct artist 'type', with the number of artists in it
group_by_type = {"$group": {"_id": "$type", "count": {"$sum": 1}}}
pipeline = [group_by_type]
artist_types = list(artists.aggregate(pipeline, allowDiskUse=True))
artist_types

[{'_id': 'Person', 'count': 1454},
 {'_id': 'Group', 'count': 25},
 {'_id': 'Orchestra', 'count': 7},
 {'_id': None, 'count': 17},
 {'_id': 'Other', 'count': 1}]

In [45]:
# pseudo-SQL equivalent of the aggregation below
# (in SQL this requires an artist table and a birthplace table)

'''
SELECT begin-area.name, count(begin-area.name), artist.begin-area-id
FROM begin-area
INNER JOIN artist ON begin-area.id == artist.begin-area-id
GROUP BY begin-area
'''

# Count artists per embedded 'begin-area.name', most common value first
group_by_birthplace = {"$group": {"_id": "$begin-area.name", "count": {"$sum": 1}}}
most_common_first = {"$sort": {"count": DESCENDING}}
pipeline = [group_by_birthplace, most_common_first]
artist_birthplaces = list(artists.aggregate(pipeline, allowDiskUse=True))
artist_birthplaces

[{'_id': None, 'count': 640},
 {'_id': 'New York', 'count': 89},
 {'_id': 'Philadelphia', 'count': 48},
 {'_id': 'Chicago', 'count': 29},
 {'_id': 'Brooklyn', 'count': 27},
 {'_id': 'Los Angeles', 'count': 24},
 {'_id': 'Detroit', 'count': 22},
 {'_id': 'Pittsburgh', 'count': 20},
 {'_id': 'Kansas City', 'count': 12},
 {'_id': 'St. Louis', 'count': 11},
 {'_id': 'Newark', 'count': 11},
 {'_id': 'Boston', 'count': 11},
 {'_id': 'New Orleans', 'count': 9},
 {'_id': 'Memphis', 'count': 8},
 {'_id': 'Washington, D.C.', 'count': 8},
 {'_id': 'Oakland', 'count': 7},
 {'_id': 'Harlem', 'count': 7},
 {'_id': 'Dallas', 'count': 7},
 {'_id': 'Indianapolis', 'count': 7},
 {'_id': 'Cleveland', 'count': 7},
 {'_id': 'Cincinnati', 'count': 6},
 {'_id': 'Buffalo', 'count': 5},
 {'_id': 'Baltimore', 'count': 5},
 {'_id': 'Manhattan', 'count': 4},
 {'_id': 'Montreal', 'count': 4},
 {'_id': 'United States', 'count': 4},
 {'_id': 'Rio de Janeiro', 'count': 4},
 {'_id': 'Tampa', 'count': 4},
 {'_id': 'Ber