Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace Eyed3 with Mutagen #47

Closed
wants to merge 13 commits into from
142 changes: 73 additions & 69 deletions indexer.py
@@ -1,4 +1,18 @@
# -*- coding: utf-8 -*-
"""Music indexer for the Shiva-Server API.
Index your music collection and (optionally) retrieve album covers and artist
pictures from Last.FM.

Usage:
shiva-indexer [-h] [-v] [-q] [--lastfm] [--nometadata]

Options:
-h, --help Show this help message and exit
--lastfm Retrieve artist and album covers from Last.FM API.
--nometadata Don't read file's metadata when indexing.
-v --verbose Show debugging messages about the progress.
-q --quiet Suppress warnings.
"""
# K-Pg
import logging
from datetime import datetime
Expand All @@ -7,39 +21,26 @@

from shiva import models as m
from shiva.app import app, db
from shiva.utils import ID3Manager
from shiva.utils import MetadataManager

q = db.session.query

USAGE = """Usage: %s [-h] [--lastfm] [--nometadata]

Music indexer for the Shiva-Server API.

Index your music collection and (optionally) retrieve album covers and artist
pictures from Last.FM.

optional arguments:
-h, --help Show this help message and exit
--lastfm Retrieve artist and album covers from Last.FM API.
--nometadata Don't read file's metadata when indexing.
""" % sys.argv[0]

if '--help' in sys.argv or '-h' in sys.argv:
print(USAGE)
sys.exit(0)


class Indexer(object):
def __init__(self, config=None, use_lastfm=False, no_metadata=False):
def __init__(self, config=None, use_lastfm=False, no_metadata=False, verbose=False, quiet=False):
self.config = config
self.use_lastfm = use_lastfm
self.no_metadata = no_metadata
self.verbose = verbose
self.quiet = quiet

self.count = 0
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please rename this to track_count?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good idea.


self.session = db.session
self.media_dirs = config.get('MEDIA_DIRS', [])
self.id3r = None

self._meta = None

self.artists = {}
self.albums = {}

Expand All @@ -50,7 +51,7 @@ def __init__(self, config=None, use_lastfm=False, no_metadata=False):
api_key = config['LASTFM_API_KEY']
self.lastfm = self.pylast.LastFMNetwork(api_key=api_key)

if len(self.media_dirs) == 0:
if not len(self.media_dirs):
print("Remember to set the MEDIA_DIRS option, otherwise I don't "
'know where to look for.')

Expand Down Expand Up @@ -92,14 +93,14 @@ def get_album(self, name, artist):

def get_release_year(self, lastfm_album=None):
if not self.use_lastfm or not lastfm_album:
return self.get_id3_reader().release_year
return self.get_metadata_reader().release_year

_date = lastfm_album.get_release_date()
if not _date:
if not self.get_id3_reader().release_year:
if not self.get_metadata_reader().release_year:
return None

return self.get_id3_reader().release_year
return self.get_metadata_reader().release_year

return datetime.strptime(_date, '%d %b %Y, %H:%M').year

Expand All @@ -112,22 +113,22 @@ def save_track(self):

full_path = self.file_path.decode('utf-8')

print(self.file_path)

track = m.Track(full_path)
if self.no_metadata:
self.session.add(track)

return True
if self.verbose:
print 'Added track without metadata: %s' % full_path
return
else:
if q(m.Track).filter_by(path=full_path).count():
return True
if self.verbose:
print 'Skipped existing track: %s' % full_path
return

use_prev = None
id3r = self.get_id3_reader()
meta = self.get_metadata_reader()

artist = self.get_artist(id3r.artist)
album = self.get_album(id3r.album, artist)
artist = self.get_artist(meta.artist)
album = self.get_album(meta.album, artist)

if artist is not None and artist not in album.artists:
album.artists.append(artist)
Expand All @@ -136,73 +137,76 @@ def save_track(self):
track.artist = artist
self.session.add(track)

return True
if self.verbose:
print 'Added track: %s' % full_path

def get_id3_reader(self):
if not self.id3r or not self.id3r.same_path(self.file_path):
self.id3r = ID3Manager(self.file_path)
self.count += 1
if self.count % 10 == 0:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's make this magic number a setting, just for clarity. Something like PERSIST_AFTER_TRACKS, or something that makes more sense.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True. I'm not even sure if we should have this at all. It speeds up re-indexing after an error occurs, but on the other side I'm not sure whether we should presist any data into the database if an error occured. What do you think?

self.session.commit()
if self.verbose:
print 'Writing to database...'

return self.id3r
def get_metadata_reader(self):
if not self._meta or self._meta.origpath != self.file_path:
self._meta = MetadataManager(self.file_path)
return self._meta

def is_track(self):
"""Tries to guess whether the file is a valid track or not.
"""
if os.path.isdir(self.file_path):
"""Try to guess whether the file is a valid track or not."""
if not os.path.isfile(self.file_path):
return False

if '.' not in self.file_path:
return False

ext = self.file_path[self.file_path.rfind('.') + 1:]
if ext not in self.config.get('ACCEPTED_FORMATS', []):
return False

if not self.get_id3_reader().is_valid():
ext = self.file_path.rsplit('.', 1)[1]
if ext not in self.get_metadata_reader().VALID_FILE_EXTENSIONS:
if not self.quiet:
msg = 'Skipped file with unknown file extension: %s'
print msg % self.file_path
return False

return True

def walk(self, dir_name):
"""Recursively walks through a directory looking for tracks.
"""
def walk(self, target):
"""Recursively walks through a directory looking for tracks."""

self.count += 1
if self.count % 10 == 0:
self.session.commit()
# If target is a file, try to save it as a track
if os.path.isfile(target):
self.file_path = target
if self.is_track():
self.save_track()

if os.path.isdir(dir_name):
for name in os.listdir(dir_name):
self.file_path = os.path.join(dir_name, name)
if os.path.isdir(self.file_path):
self.walk(self.file_path)
else:
# Otherwise, recursively walk the directory looking for files
else:
for root, dirs, files in os.walk(target):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does os.walk perform with lots of files and directories? Is it better that calling self.walk recursively? I honestly don't know, many of Shiva users have large collections, this may make a big difference.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I doubt that os.walk will be slower than os.listdir, it uses the same top-down recursive approach. And readability is better. IMO using os.listdir would be premature optimization if you don't already know that it will be faster.

I tried to measure it but it's pretty hard because there seems to be some caching going on. The first time I walked my music folder it took 5.5 seconds, the following times it was down to 1.8 to 2 seconds.

A possibility to speed up things is mentioned here: http://stackoverflow.com/a/4739544/284318 But it's filesystem specific.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I let os.walk run against my 200GB music collection which is usually at least three directory levels deep (genre, artist, album).
Also I find os.walk much more readable.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking about the fact that os.walk returns a list, if it had to read all the filesystem first and load it into memory it would be quite slow. But it has to be a generator, which makes it definitely faster than calling os.listdir recursively. 👍

for name in files:
self.file_path = os.path.join(root, name)
if self.is_track():
try:
self.save_track()
except Exception, e:
logging.warning("%s not imported - %s" % (
self.file_path, e.message))

return True
self.save_track()

def run(self):
for mobject in self.media_dirs:
for mdir in mobject.get_valid_dirs():
self.walk(mdir)


if __name__ == '__main__':
use_lastfm = '--lastfm' in sys.argv
no_metadata = '--nometadata' in sys.argv
from docopt import docopt
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A big 👍 to the introduction of docopt.

arguments = docopt(__doc__)

use_lastfm = arguments['--lastfm']
no_metadata = arguments['--nometadata']

if no_metadata:
use_lastfm = False

if use_lastfm and not app.config.get('LASTFM_API_KEY'):
print('ERROR: You need a Last.FM API key if you set the --lastfm '
sys.stderr.write('ERROR: You need a Last.FM API key if you set the --lastfm '
'flag.\n')
sys.exit(1)

lola = Indexer(app.config, use_lastfm=use_lastfm, no_metadata=no_metadata)
lola = Indexer(app.config, use_lastfm=use_lastfm, no_metadata=no_metadata, verbose=arguments['--verbose'], quiet=arguments['--quiet'])
lola.run()

# Petit performance hack: Every track will be added to the session but they
Expand Down
3 changes: 2 additions & 1 deletion requirements.pip
@@ -1,8 +1,9 @@
Flask==0.9
Flask-Restful==0.1.2
Flask-SQLAlchemy==0.16
eyed3==0.7.1
requests==1.0.4
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😃

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🍻

translitcodec==0.3
pyLast==0.5.11
lxml==3.1beta1
mutagen==1.21
docopt==0.6.1
38 changes: 19 additions & 19 deletions shiva/models.py
Expand Up @@ -3,7 +3,7 @@

from flask.ext.sqlalchemy import SQLAlchemy

from shiva.utils import slugify as do_slug, randstr, ID3Manager
from shiva.utils import slugify as do_slug, randstr, MetadataManager

db = SQLAlchemy()

Expand Down Expand Up @@ -46,7 +46,7 @@ class Artist(db.Model):
__tablename__ = 'artists'

pk = db.Column(db.Integer, primary_key=True)
# TODO: Update the files' ID3 tags when changing this info.
# TODO: Update the files' Metadata when changing this info.
name = db.Column(db.String(128), nullable=False)
slug = db.Column(db.String(128), unique=True, nullable=False)
image = db.Column(db.String(256))
Expand Down Expand Up @@ -98,8 +98,7 @@ def __repr__(self):


class Track(db.Model):
"""
"""
"""Track model."""

__tablename__ = 'tracks'

Expand All @@ -109,7 +108,9 @@ class Track(db.Model):
slug = db.Column(db.String(128), unique=True)
bitrate = db.Column(db.Integer)
file_size = db.Column(db.Integer)
# TODO could be float if number weren't converted to an int in metadata manager
length = db.Column(db.Integer)
# TODO number should probably be renamed to track or track_number
number = db.Column(db.Integer)

lyrics = db.relationship('Lyrics', backref='track', uselist=False)
Expand All @@ -119,7 +120,7 @@ class Track(db.Model):
nullable=True)

def __init__(self, path):
if type(path) not in (unicode, str, file):
if not isinstance(path, (basestring, file)):
raise ValueError('Invalid parameter for Track. Path or File '
'expected, got %s' % type(path))

Expand All @@ -128,7 +129,7 @@ def __init__(self, path):
_path = path.name

self.set_path(_path)
self._id3r = None
self._meta = None

def __setattr__(self, attr, value):
if attr == 'title':
Expand All @@ -146,19 +147,18 @@ def set_path(self, path):
if path != self.get_path():
self.path = path
if os.path.exists(self.get_path()):
self.file_size = self.get_id3_reader().size
self.bitrate = self.get_id3_reader().bitrate
self.length = self.get_id3_reader().length
self.number = self.get_id3_reader().track_number
self.title = self.get_id3_reader().title

def get_id3_reader(self):
"""Returns an object with the ID3 info reader.
"""
if not getattr(self, '_id3r', None):
self._id3r = ID3Manager(self.get_path())

return self._id3r
meta = self.get_metadata_reader()
self.file_size = meta.filesize
self.bitrate = meta.bitrate
self.length = meta.length
self.number = meta.track_number
self.title = meta.title

def get_metadata_reader(self):
"""Return a MetadataManager object."""
if not getattr(self, '_meta', None):
self._meta = MetadataManager(self.get_path())
return self._meta

def __repr__(self):
return "<Track ('%s')>" % self.title
Expand Down
4 changes: 2 additions & 2 deletions shiva/resources.py
Expand Up @@ -499,8 +499,8 @@ def __init__(self, artist, json):
self.venue = json['venue']

def split_artists(self, json):
if len(json) == 0:
([], [])
if not len(json):
return ([], [])
elif len(json) == 1:
artist = Artist.query.filter_by(name=json[0]['name']).first()

Expand Down