In [62]:
#!pip install musicbrainzngs
#!pip install pymongo

In [76]:
import os
import time
import pprint
import datetime
from pymongo import MongoClient, IndexModel, ASCENDING, DESCENDING
from pymongo.errors import DuplicateKeyError
import musicbrainzngs
from musicbrainzngs import *

In [77]:
# Config musicbrainz: identify this notebook to the MusicBrainz web service
# via the user agent (application name, version, optional contact).
# NOTE(review): the link below documents version 1 of the web service; the
# response keys used later in this notebook ('artist-list', 'medium-list')
# presumably come from the version 2 API -- confirm and update the link.
# https://musicbrainz.org/doc/Development/XML_Web_Service/Version_1

musicbrainzngs.set_useragent(app="coffeecode", version=1, contact=None)

In [78]:
# Open a connection to the MongoDB server running on this machine

client = MongoClient(host='localhost', port=27017)

In [79]:
# Get a handle to the jazz_catalog database

db = client['jazz_catalog']

In [80]:
# Start from three empty collections: artists, releases and labels
# Note that due to network limitations we will use only a subset of available Musicbrainz metadata

# Drop any data left over from a previous run so the notebook is repeatable
for collection_name in ('artists', 'releases', 'labels'):
    db.drop_collection(collection_name)

artists = db['artists']
releases = db['releases']
labels = db['labels']

In [81]:
# Query MusicBrainz for artists matching the seed name

seedArtist = musicbrainzngs.search_artists(query="miles davis")

In [82]:
# List every search hit as "<name>, <MusicBrainz id>"
for artist in seedArtist['artist-list']:
    print(artist['name'], artist['id'], sep=", ")

Miles Davis, 561d854a-6a28-4aa7-8c99-323e6ce46c2a
Miles Davis Quintet, fe7245e7-d734-4ca1-8e26-691883f58201
The Miles Davis Nonet, 16d2b8e6-8930-4c9e-856f-e6d096479350
Miles Davis Sextet, f137837b-fa55-48d1-8c85-f13961b3ac77
Miles Davis Quartet, 03606dee-b333-48fe-b9af-d671c837d40f
Miles Davis Septet, 88130878-7ee9-4ce2-b24b-1dd9da898030
Miles Davis Moody, d74d6350-a042-4202-b4fd-2a05918a06be
Miles Davis All Stars, 607d7275-2b92-49e5-a7ee-32f9cc29b076
Miles Davis + 19, 72f2e1fe-dc92-4011-93d7-2fb6aff4a2bd
The Miles Davis / Tadd Dameron Quintet, 7103c200-fd5c-4890-8685-5328a055f7ba
Miles!, f2b73b53-3414-4883-b014-622ab6917215
Miles Davis and His Orchestra, c52ff92e-b338-4815-bd63-4e9b89795e84
Miles Davis All Star Sextet, 47c342e2-5c5f-4b5a-b919-8e0ffc329272
The Shoes Of Miles Davis, addd7107-3955-4461-a62c-0b346feb5450
Miles Davis & The Modern Jazz Giants, e1b6e38e-ab75-43fc-b994-bfeea9bc7626
Miles Davis and the Lighthouse All-Stars, 4cd1a02d-9645-4172-8d45-e468e097c23b
Conrad Roberts, 

In [83]:
# Index the 'id' field of each collection and make it unique,
# so the same MusicBrainz entity cannot be stored twice

db['artists'].create_index([('id', ASCENDING)], name='artist_id', unique=True)
db['releases'].create_index([('id', ASCENDING)], name='release_id', unique=True)
db['labels'].create_index([('id', ASCENDING)], name='label_id', unique=True)

'label_id'

In [84]:
# Fetch the full MusicBrainz record for the first two search hits and
# pretty-print each response for inspection
# (nothing is written to db.artists in this cell)

top_hits = seedArtist['artist-list'][:2]
for artist in top_hits:
    mbid = artist['id']
    artist_info = musicbrainzngs.get_artist_by_id(mbid)
    pprint.pprint(artist_info)

{'artist': {'area': {'id': '489ce91b-6658-3307-9877-795b68554c98',
                     'iso-3166-1-code-list': ['US'],
                     'name': 'United States',
                     'sort-name': 'United States'},
            'begin-area': {'id': '39e1719e-2604-4b59-b698-dab7caf85b33',
                           'name': 'Alton',
                           'sort-name': 'Alton'},
            'country': 'US',
            'disambiguation': 'jazz trumpeter, bandleader, songwriter',
            'end-area': {'id': 'dbacf2e3-7e3e-4cee-8804-999b109285fa',
                         'name': 'Santa Monica',
                         'sort-name': 'Santa Monica'},
            'gender': 'Male',
            'id': '561d854a-6a28-4aa7-8c99-323e6ce46c2a',
            'ipi': '00007619785',
            'ipi-list': ['00007619785'],
            'isni-list': ['000000012144707X'],
            'life-span': {'begin': '1926-05-26',
                          'end': '1991-09-28',
                          'ended'

In [85]:
# Store the 'artist' part of each of the first two hits in the artists collection
for hit in seedArtist['artist-list'][:2]:
    response = musicbrainzngs.get_artist_by_id(hit['id'])
    artists.insert_one(response['artist'])

In [86]:
# Try again to check uniqueness constraint:
# re-inserting the same two artists should be rejected by the unique index
# on 'id' with an E11000 duplicate key error, which is printed.
# Only DuplicateKeyError is caught, so any other failure still surfaces
# instead of being printed and ignored.

for artist in seedArtist['artist-list'][:2]:
    mbid = artist['id']
    artist_info = musicbrainzngs.get_artist_by_id(mbid)
    try:
        artists.insert_one(artist_info['artist'])
    except DuplicateKeyError as e:
        print(e)

E11000 duplicate key error collection: jazz_catalog.artists index: artist_id dup key: { : "561d854a-6a28-4aa7-8c99-323e6ce46c2a" }
E11000 duplicate key error collection: jazz_catalog.artists index: artist_id dup key: { : "fe7245e7-d734-4ca1-8e26-691883f58201" }


In [100]:
# Number of documents in the artists collection.
# Collection.count() is deprecated and removed in PyMongo 4;
# count_documents({}) with an empty filter counts every document.
db.artists.count_documents({})

2

In [87]:
# Pretty-print every document stored in the artists collection
for artist_doc in artists.find():
    pprint.pprint(artist_doc)

{'_id': ObjectId('5c06e40366267d2c60267001'),
 'area': {'id': '489ce91b-6658-3307-9877-795b68554c98',
          'iso-3166-1-code-list': ['US'],
          'name': 'United States',
          'sort-name': 'United States'},
 'begin-area': {'id': '39e1719e-2604-4b59-b698-dab7caf85b33',
                'name': 'Alton',
                'sort-name': 'Alton'},
 'country': 'US',
 'disambiguation': 'jazz trumpeter, bandleader, songwriter',
 'end-area': {'id': 'dbacf2e3-7e3e-4cee-8804-999b109285fa',
              'name': 'Santa Monica',
              'sort-name': 'Santa Monica'},
 'gender': 'Male',
 'id': '561d854a-6a28-4aa7-8c99-323e6ce46c2a',
 'ipi': '00007619785',
 'ipi-list': ['00007619785'],
 'isni-list': ['000000012144707X'],
 'life-span': {'begin': '1926-05-26', 'end': '1991-09-28', 'ended': 'true'},
 'name': 'Miles Davis',
 'sort-name': 'Davis, Miles',
 'type': 'Person'}
{'_id': ObjectId('5c06e40466267d2c60267002'),
 'area': {'id': '489ce91b-6658-3307-9877-795b68554c98',
          'iso-316

In [92]:
# For every stored artist, fetch its release list from MusicBrainz
# (result sets are left at the default of 25 to minimize network impact)
# and keep only the releases whose status is 'Official'.
# Each official release is then fetched again with complete track info, and
# the 'release' part of that response is inserted into the releases collection.

for artist_doc in artists.find():
    artist_response = musicbrainzngs.get_artist_by_id(
        artist_doc['id'],
        includes=['releases', 'release-rels', 'label-rels'],
    )
    for release in artist_response['artist']['release-list']:
        # Skip releases with no status or with a non-official status
        if release.get('status') != "Official":
            continue
        release_response = musicbrainzngs.get_release_by_id(
            release['id'],
            includes=['recordings', 'recording-level-rels', 'artist-rels'],
        )
        try:
            releases.insert_one(release_response['release'])
        except Exception as e:
            # Best effort: report the failed insert and carry on
            print(e)

In [99]:
# Number of documents in the releases collection.
# Collection.count() is deprecated and removed in PyMongo 4;
# count_documents({}) with an empty filter counts every document.
db.releases.count_documents({})

45

In [108]:
# Take a single stored release document to explore its structure
a_release = releases.find_one()

In [113]:
# Track list of the first entry in the release's 'medium-list'
tracks = a_release['medium-list'][0]['track-list']

In [115]:
# All entries of the release's 'medium-list'
media = a_release['medium-list']

In [116]:
# How many entries does the 'medium-list' of this release have?
len(media)

1

In [117]:
# Report the number of tracks on each medium of the sample release
for medium in media:
    tracks = medium['track-list']
    track_total = len(tracks)
    print(track_total)

6


In [119]:
# Work with the track list of the first medium from here on
tracks = media[0]['track-list']

In [120]:
# Inspect the first track to see which fields a track carries
tracks[0]

{'id': '37e8f384-fd8f-3980-ab99-c69271a21204',
 'length': '220360',
 'number': '1',
 'position': '1',
 'recording': {'artist-relation-list': [{'artist': {'id': '534362cd-66ed-45a8-9cba-b847dbf75b6f',
     'name': 'Percy Heath',
     'sort-name': 'Heath, Percy'},
    'attribute-list': ['bass'],
    'attributes': [{'attribute': 'bass'}],
    'begin': '1954-03-06',
    'direction': 'backward',
    'end': '1954-03-06',
    'ended': 'true',
    'target': '534362cd-66ed-45a8-9cba-b847dbf75b6f',
    'type': 'instrument',
    'type-id': '59054b12-01ac-43ee-a618-285fd397e461'},
   {'artist': {'disambiguation': 'jazz trumpeter, bandleader, songwriter',
     'id': '561d854a-6a28-4aa7-8c99-323e6ce46c2a',
     'name': 'Miles Davis',
     'sort-name': 'Davis, Miles'},
    'attribute-list': ['trumpet'],
    'attributes': [{'attribute': 'trumpet'}],
    'begin': '1954-03-06',
    'direction': 'backward',
    'end': '1954-03-06',
    'ended': 'true',
    'target': '561d854a-6a28-4aa7-8c99-323e6ce46c2a'

In [122]:
# avoid duplicate requests later - make a set of IDs
# artist_ids holds each credited artist once, while artist_count records how
# many artist relations (repeats included) were seen on the tracks.

artist_ids = set()
artist_count = 0

for each in tracks:
    # Tracks without a 'recording', or recordings without an
    # 'artist-relation-list', contribute nothing.
    artist_list = each.get('recording', {}).get('artist-relation-list', [])
    for a in artist_list:
        artist_count += 1
        # Only the id is needed; the artist name was looked up but never used.
        artist_ids.add(a['artist']['id'])

In [123]:
# Total artist relations seen on the tracks (repeats included)
print(artist_count)

15


In [124]:
# Number of distinct artist IDs among those relations
len(artist_ids)

5

In [125]:
# Display the distinct artist IDs collected from the tracks
artist_ids

{'534362cd-66ed-45a8-9cba-b847dbf75b6f',
 '561d854a-6a28-4aa7-8c99-323e6ce46c2a',
 '601e7466-eaf5-4a91-9909-ffd770b7e04a',
 'c332fcf2-cc5c-424e-a5ea-317b31f6c035',
 'd185d986-ee96-4fd3-bd61-8c848a4765b6'}

In [127]:
# Distinct 'id' values already present in the artists collection
artists.distinct('id')

['561d854a-6a28-4aa7-8c99-323e6ce46c2a',
 'fe7245e7-d734-4ca1-8e26-691883f58201']

In [128]:
# Same IDs as a set, so they can be compared with artist_ids using set operations
current_artist_ids = set(artists.distinct('id'))

In [131]:
# IDs found on the tracks that are not yet stored in the artists collection
new_artist_search_ids = artist_ids.difference(current_artist_ids)

In [132]:
# Display the artist IDs that still need to be looked up
new_artist_search_ids

{'534362cd-66ed-45a8-9cba-b847dbf75b6f',
 '601e7466-eaf5-4a91-9909-ffd770b7e04a',
 'c332fcf2-cc5c-424e-a5ea-317b31f6c035',
 'd185d986-ee96-4fd3-bd61-8c848a4765b6'}

In [133]:
# Repeat the collection step over the whole releases collection:
# walk every medium and every track of each stored release, gather the IDs
# of all related artists into a set, and count each relation as it is seen.

artist_ids = set()
artist_count = 0

for release_doc in releases.find():
    # Missing keys at any level are treated as "nothing to iterate"
    for medium in release_doc.get('medium-list', ()):
        for track in medium.get('track-list', ()):
            recording = track.get('recording', {})
            for relation in recording.get('artist-relation-list', ()):
                artist_count += 1
                artist_ids.add(relation['artist']['id'])

In [134]:
# Total artist relations seen across all stored releases (repeats included)
artist_count

1455

In [136]:
# Number of distinct artist IDs across all stored releases
len(artist_ids)

86