In [1]:
#!/usr/bin/env python
# Copyright 2016 Aaron ciuffo

# Human-readable version / licence banner for this script
version = '''NPR Podcast Downloader V5.0

by Aaron Ciuffo (txoof.com)
released without warranty under GPLV3:
http://www.gnu.org/licenses/gpl-3.0.html
Please don't sue me.
'''

# Short program identifier; main() uses it as the logger name and as the
# base name of the log file (programName + '.log')
programName = 'podcastdownload'

# Imports
from datetime import datetime # for time stuff
import pytz
import logging # logging library
from urllib2 import urlopen # standard library for interfacing with web resources
import re # regular expressions
import json # handle JSON objects
import os # Opperating System interface 
import sys # internal opperations including a list of imported modules
import argparse # parse command line arguments
import ConfigParser # parse config files



# TO DO
 * update the output path to match the date of the show and a 0 or 2 for morning/evening
 * Look at m3u; don't do anything if complete; download missing segments?

By default the script should not repeat a download unless asked.

In [2]:
def loadModules():
    '''Load the non-standard-library modules the downloader depends on.

    ``requests`` is mandatory: if it cannot be imported the failure is logged
    and the interpreter exits with status 2.  The mutagen taggers are
    optional: each one that imports successfully is registered in the global
    ``taggers`` dict under its lowercase file extension; one that fails to
    import is logged and left out of the dict.

    Side effects:
        Sets the module globals ``requests`` and ``taggers`` and, when the
        corresponding import succeeds, ``MP3`` and ``MP4``.

    Returns:
        bool: True once loading has finished

    Raises:
        SystemExit: with status 2 when ``requests`` cannot be imported
    '''
    global requests, taggers, MP3, MP4

    logging.debug('loading module: requests')
    try:
        import requests
    except Exception as e:
        logging.critical('Fatal Error\nFailed to load module: requests\n%s', e)
        logging.critical('Please install requests module: http://docs.python-requests.org/')
        # sys.exit() instead of the site-provided exit(), which is only
        # guaranteed to exist in interactive sessions
        sys.exit(2)

    logging.debug('loading module: mutagen.mp3')
    # global registry of all the taggers available, keyed by file extension
    taggers = {}
    try:
        from mutagen.mp3 import EasyMP3 as MP3
    except Exception as e:
        logging.critical('Failed to load module: mutagen.mp3\n%s', e)
        logging.critical('tagging may not be available')
    else:
        # only register the tagger when the import actually worked;
        # registering unconditionally raised NameError on a failed import
        taggers['mp3'] = MP3

    logging.debug('loading module: mutagen.mp4')
    try:
        from mutagen.mp4 import MP4
    except Exception as e:
        logging.critical('Failed to load module: mutagen.mp4\n%s', e)
        logging.critical('tagging may not be available')
    else:
        taggers['mp4'] = MP4

    return(True)

In [3]:
class Episode(object):
    '''Podcast episode object

    Holds the Segment() objects that make up one episode and knows how to
    download them, tag them, write an m3u playlist and record the download.
    The methods log through the module-level ``logger``; tagSegments() also
    reads the module-level ``taggers`` dict populated by loadModules().
    '''

    def __init__(self):
        '''
        Attributes:
            name (str): name of episode/podcast
            programURL (str): Index URL
            segments (list): Segment() objects to be downloaded
            showDate (str): date of the show
            outputBasePath (str): base path to use for output of files (default is ./)
            outputPath (str): full path to output the downloaded podcast files
            m3u (str): m3u playlist filename
            downloadLog (str): download log filename
        '''
        self.name = "undef" # str
        self.programURL = "undef" # str
        self.segments = [] # list
        self.showDate = '' # str
        self.outputBasePath = './' # str
        self.outputPath = '' # str
        self.m3u = 'playlist'
        self.downloadLog = 'download.log'
        # setOutputPath() stores the path itself and returns a bool, so its
        # return value must not be assigned to outputPath
        self.setOutputPath()

    def _slugify(self, value):
        """
        Normalizes string, removes every character that is not a word
        character, whitespace or a hyphen, and converts runs of whitespace
        and hyphens to a single hyphen. Case is preserved.

        From Django's "django/template/defaultfilters.py".
        Args:
            value (str): string to be normalized for use with a filename

        Returns:
            unicode: sluggified string
        """
        import unicodedata

        _slugify_strip_re = re.compile(r'[^\w\s-]')
        _slugify_hyphenate_re = re.compile(r'[-\s]+')

        try:
            text_type = unicode # Python 2
        except NameError:
            text_type = str # Python 3 has no separate unicode type

        if not isinstance(value, text_type):
            value = text_type(value)
        # decompose accented characters, drop whatever cannot be expressed in
        # ASCII, then return to a text string so the regexes operate on text
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = _slugify_strip_re.sub('', value).strip()
        return _slugify_hyphenate_re.sub('-', value)

    def setOutputPath(self, episodePath = ''):
        '''
        Update the output path to include any additional path information for an episode
        Args:
            episodePath (str): additional path to be appended

        Returns:
            bool: True
        '''
        # os.path.join inserts a separator only where one is missing, so the
        # base path works with or without a trailing slash; the empty final
        # component keeps the trailing separator on the result
        self.outputPath = os.path.join(self.outputBasePath or '.',
                                       episodePath.lstrip('/'), '')
        return(True)

    def setM3U(self, name = 'playlist'):
        '''
        Update the m3u file name
        Args:
            name (str): filename for the m3u; it is slugified and '.m3u' is appended

        Returns:
            bool: True
        '''
        self.m3u = self._slugify(name) + '.m3u'
        return(True)

    def writeM3U(self):
        '''
        Write M3U playlist for the episode in the root of the output directory.
        Only segments flagged as downloaded are listed.

        Returns:
            bool: True on success, False if the playlist could not be opened or written
        '''
        playlistPath = os.path.join(self.outputBasePath, self.m3u)

        logger.info('opening m3u playlist: %s', self.m3u)
        try:
            m3ufile = open(playlistPath, 'w')
        except Exception as e:
            logger.error('could not open m3u file: %s\n%s', self.m3u, e)
            return(False)

        logger.debug('writing segments to: %s', self.m3u)
        # NOTE(review): the playlist lives in outputBasePath while the audio
        # files are written to outputPath, yet entries are bare filenames;
        # confirm whether entries should carry the episode sub-directory
        try:
            # the with-block closes the file even when a write fails
            with m3ufile:
                for segment in self.segments:
                    # if it was successfully downloaded write it to the m3u file
                    if segment.downloaded:
                        logger.debug('writing segment: %s', segment.filename)
                        m3ufile.write(segment.filename + '\n')
        except Exception as e:
            logger.error('could not write to: %s\n%s', self.m3u, e)
            logger.error('halting m3u writing')
            return(False)

        return(True)

    def download(self):
        '''
        Download all segments in self.segments into self.outputPath and
        append the output path to the download log.

        Returns:
            bool: True for successful download of all segments
        '''
        success = True

        logger.info('downloding program: %s', self.name)

        # check for output path
        if not os.path.isdir(self.outputPath):
            try:
                os.makedirs(self.outputPath)
            except Exception as e:
                logger.error('could not create outputpath for this episdoe at: %s\n%s', self.outputPath, e)
                logger.error('download failed')
                return(False)

        for segment in self.segments:
            # update the path for the current segment
            filePath = os.path.join(self.outputPath, segment.filename)
            logger.debug('downloading %s', segment.audioURL)
            try:
                response = urlopen(segment.audioURL)
                try:
                    audioFile = response.read()
                finally:
                    # release the connection whether or not the read worked
                    response.close()
            except Exception as e:
                logger.warning('could not download segment number: %s\nerrors follow', segment.number)
                logger.warning(e)
                success = False
                continue

            logger.info('writing file to %s', filePath)
            try:
                with open(filePath, 'wb') as code:
                    code.write(audioFile)
            except Exception as e:
                logger.warning('could not write segment number %s to %s\nerrors follow', segment.number, filePath)
                logger.warning(e)
                success = False
                continue
            # record that the file was written and closed without error
            segment.downloaded = True

        self.logDownload()
        return(success)

    def logDownload(self):
        '''
        Append this episode's output path to the download log kept in the
        output base path.

        Returns:
            bool: True on success, False if the log could not be opened or written
        '''
        logFile = os.path.join(self.outputBasePath, self.downloadLog)

        logger.debug('opening log file: %s', logFile)
        try:
            f = open(logFile, 'a')
        except Exception as e:
            logger.error('could not open log file: %s\n%s', logFile, e)
            return(False)

        try:
            # the with-block closes the file even when the write fails
            with f:
                f.write(self.outputPath + '\n')
        except Exception as e:
            logger.error('could not write to log file: %s\n%s', logFile, e)
            return(False)

        return(True)

    def addSegment(self, segment):
        '''
        Add a downloadable segment to the segment list
        Args:
            segment (Segment): Segment() object containing information
        Returns:
            bool: True
        '''
        self.segments.append(segment)
        return(True)

    def tagSegments(self):
        '''
        Tag every segment whose file extension has a registered tagger with
        its title, track number and program name. Files with an unknown or
        missing extension are skipped; tagging errors are logged and do not
        stop the remaining segments from being processed.

        Returns:
            bool: True
        '''
        logger.info('tagging segments')
        for segment in self.segments:
            filename = os.path.join(self.outputPath, segment.filename)
            # extension without the dot, lowercased to match the tagger keys
            filetype = os.path.splitext(filename)[1][1:].lower()

            myTagger = taggers.get(filetype)
            if myTagger is None:
                logger.info('could not tag, unknown filetype: %s', filename)
                continue

            logger.debug('tagging %s', filename)
            try:
                # opening the file can fail too (missing or corrupt file),
                # so it belongs inside the guarded section
                audio = myTagger(filename)
                audio['title'] = segment.title
                audio['tracknumber'] = str(segment.number)
                audio['album'] = segment.programName
                audio.save()
            except Exception as e:
                logger.error('could not write tags for: %s\n%s', filename, e)

        return(True)

In [4]:
class NPREpisode(Episode, object):
    '''NPR News episode object

    Extends Episode with the ability to scrape an NPR program page for the
    show date, program name and the list of audio segments.
    '''

    def __init__(self):
        super(NPREpisode, self).__init__()
        self.jsonData = None # parsed JSON episode listing, set by getepisode_HTML()

    def recentEpisodes(self):
        '''Identify the most recent episodes (not yet implemented)'''
        pass

    def getepisode_API(self):
        '''
        Use the NPR API to get a list of episodes
        Not yet implemented
        '''
        pass

    def _shortName(self):
        '''
        Build the suffix appended to the show date in the output path: an
        underscore followed by the first character of each word of self.name,
        with two characters taken from the last word (MORNING EDITION -> _MED).
        When self.name is empty, epoch seconds are used instead so that two
        episodes do not clobber each other.

        Returns:
            str: suffix beginning with '_'
        '''
        if len(self.name) > 0:
            words = self.name.split(' ')
            initials = [word[:1] for word in words[:-1]]
            initials.append(words[-1][:2])
            return '_' + ''.join(initials)
        epoch_seconds = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds())
        return '_' + str(epoch_seconds)

    def getepisode_HTML(self):
        '''
        Scrape the HTML for JSON containing the date segment and title information
        Attributes set here:
            self.jsonData (json obj) - JSON listing of episodes from NPR
            self.showDate (str) - YYYY-MM-DD formatted string ('' when none is found)
            self.name (str) - human readable show name
            self.segments (:obj: Segment) - episode segments are populated and added
            self.outputPath, self.m3u - derived from the show date and name

        Returns:
            bool: True if successful, False otherwise
        '''

        logger.info('fetching episode info via HTML method')
        logger.debug('source: %s' % self.programURL)

        # search terms hardcoded here
        search_PlayAll = r"<b.*data-play-all='({.*})'><\/b>" #re search string for JSON data in program HTML
        search_FileName = r"(^[\s|\w|\.|'|-]*)\[?|$]" #(anySpaces OR anyWords OR anyPeriod OR any' OR any-)? OR EOL
        search_showDate = r'datetime="(\d{4}-\d{2}-\d{2})' #re search for show date

        try: # fetch the full show HTML
            programHTML = urlopen(self.programURL).read()
        except Exception as e:
            logger.warning('could not fetch episode information from %s' % self.programURL)
            logger.error(e)
            return(False)
        logger.info('HTML retrieved')

        # find the show date and record it; a page without a date is not
        # fatal, the output path then carries only the program abbreviation
        dateMatch = re.search(search_showDate, programHTML)
        if dateMatch is None:
            self.showDate = ''
            logger.warning('no valid showDate found')
        else:
            self.showDate = dateMatch.group(1)
            logger.info('show date: %s', self.showDate)

        try: # find the JSON program data
            self.jsonData = json.loads(re.search(search_PlayAll, programHTML).group(1))
            audioData = self.jsonData['audioData']
        except Exception as e:
            logger.error('no valid JSON episode listing found in HTML from %s', self.programURL)
            logger.error(e)
            return(False)

        # check that some JSON data was found; a single segment is enough
        try:
            if len(audioData) < 1:
                raise ValueError('audioData is empty')
            self.name = audioData[0]['program'].upper() # set the episode name
        except Exception as e:
            logger.warning('no valid audioData found in JSON object for this program')
            logger.warning(e)
            return(False)
        logger.debug('JSON program information found for %s', self.name)
        logger.debug('segments found: %s', len(audioData))

        #update the output path to include the show date
        self.setOutputPath(self.showDate + self._shortName())
        logger.debug('output path set to: %s', self.outputPath)

        #set m3u name
        self.setM3U(self.showDate + '-' + self.name)
        logger.debug('m3u filename set to: %s', self.m3u)

        # walk the JSON object and find all the audioData information
        for key, val in enumerate(audioData):
            number = key + 1 # set the human readable segment number
            try:
                audioURL = val['audioUrl']
                title = val['title']
                urlTail = audioURL.split('/')[-1]
            except Exception as e:
                # skip the entry instead of carrying on with values left
                # over from the previous iteration
                logger.warning('failed to parse JSON data: %s', e)
                continue
            logger.debug('%s - %s', number, title)

            # leading run of filename characters, i.e. everything before the query string
            filename = re.search(search_FileName, urlTail).group(1)
            if not filename:
                logger.warning('no filename found; dropping segment')
                continue

            # prepend the segment number
            filename = str(number).zfill(3) + '_' + filename

            self.addSegment(Segment(audioURL = audioURL, filename = filename,
                                    number = number, programName = self.name,
                                    title = title))

        return(True)
            

In [5]:
class Segment():
    '''A single downloadable piece of a podcast episode'''

    def __init__(self, audioURL, filename, number, programName, title = None):
        '''
        Store where the segment comes from and how it should be saved.

        Args:
            audioURL (str): URL to specific downloadable content
            filename (str): output filename
            number (int): ordinal number of segment
            programName (str): program Name
            title (str): human readable segment title

        Attributes:
            downloaded (bool): starts as False; Episode.download() sets it
                to True once the segment has been written

        '''
        # nothing has been fetched at construction time
        self.downloaded = False

        # descriptive data, used when tagging the file
        self.programName = programName
        self.title = title
        self.number = number

        # source and destination of the audio
        self.filename = filename
        self.audioURL = audioURL

In [16]:
def main(argv=None):
    '''
    Configure logging for the program and load the third-party modules.

    The logger is published as the module-level name ``logger`` because the
    Episode classes log through that name.

    Args:
        argv (list): reserved for command line handling; not used here
    '''
    global logger

    # set the default logging level
    loglevel = 'DEBUG'
    # get the numeric level from the logging module; None for an unknown
    # name so that the check below reports it
    numeric_level = getattr(logging, loglevel.upper(), None)

    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % loglevel)

    # create a logging instance
    logging.basicConfig(level=numeric_level)
    logger = logging.getLogger(programName)
    # basicConfig() does nothing when the root logger is already configured,
    # so set the level on the named logger as well
    logger.setLevel(numeric_level)

    # create a file handler and set the level that should be logged; this
    # cell may be run many times in one kernel, so attach it only once
    if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
        handler = logging.FileHandler(programName+'.log')
        handler.setLevel(numeric_level)
        # add the handler to the logger object
        logger.addHandler(handler)

    logger.info('starting log')

    if not loadModules():
        logger.critical('could not load required modules')
        return
    
# run the logging / module set-up defined in this cell
main()


INFO:podcastdownload:starting log


In [16]:
# requires a module-level `logger` to exist in the kernel before this cell runs
logger.info('starting log')

# import requests and the mutagen taggers; the cell value is loadModules()'s result
loadModules()


INFO:podcastdownload:starting log
DEBUG:root:loading module: requests
DEBUG:root:loading module: mutagen.mp3
DEBUG:root:loading module: mutagen.mp4


True

In [17]:
# current time as a timezone-aware UTC datetime ...
now_utc = datetime.now(pytz.timezone('UTC'))
# ... converted to the US/Eastern zone for display
now_east = now_utc.astimezone(pytz.timezone('US/Eastern'))
print now_east




2017-01-16 08:52:31.068201-05:00


In [19]:
# build an episode object and point it at the output tree
mynpr = NPREpisode()
mynpr.outputBasePath = './output/'
# placeholder sub-directory; getepisode_HTML() below calls setOutputPath()
# again with the show date and the abbreviated program name
mynpr.setOutputPath('foo')
# program index page to scrape; the commented lines are alternative programs
#mynpr.programURL = 'http://www.npr.org/programs/all-things-considered'
mynpr.programURL = 'http://www.npr.org/programs/morning-edition/'
#mynpr.programURL = 'http://www.npr.org/programs/wait-wait-dont-tell-me/'

# fetch the page and populate mynpr.segments; the cell value is the success flag
mynpr.getepisode_HTML()

INFO:podcastdownload:fetching episode info via HTML method
DEBUG:podcastdownload:source: http://www.npr.org/programs/morning-edition/
INFO:podcastdownload:HTML retrieved
INFO:podcastdownload:show date: 2017-01-16
DEBUG:podcastdownload:JSON program information found for MORNING EDITION
DEBUG:podcastdownload:segments found: 17
DEBUG:podcastdownload:output path set to: ./output/2017-01-16_MED/
DEBUG:podcastdownload:m3u filename set to: 2017-01-16-MORNING-EDITION.m3u
DEBUG:podcastdownload:1 - Sen. Risch Says Trump Administration Will Be 'Much Stronger' On Russia
DEBUG:podcastdownload:2 - Trump's Promises Of Deportations Create Uncertainty For N.J. Family
DEBUG:podcastdownload:3 - With His Choice Of Inauguration Prayer Leaders, Trump Shows His Values
DEBUG:podcastdownload:4 - 'Indianians' No More: 'Hoosier' Gets Official Status
DEBUG:podcastdownload:5 - Ambassador To China Shares Lessons Learned With The Next Administration
DEBUG:podcastdownload:6 - Scientists Have Twisted Molecules Into Th

True

In [20]:
#mynpr.showDate + '-' +mynpr.name.replace(' ', '_') + '.m3u'
# derive the playlist filename from the show date and program name;
# setM3U() slugifies the text and appends '.m3u'
mynpr.setM3U(mynpr.showDate + '-' + mynpr.name)
#mynpr.setM3U('foobar')
print mynpr.m3u

2017-01-16-MORNING-EDITION.m3u


In [21]:
# fetch every segment into mynpr.outputPath; the cell value is the overall success flag
mynpr.download()

INFO:podcastdownload:downloding program: MORNING EDITION
DEBUG:podcastdownload:downloading https://ondemand.npr.org/anon.npr-mp3/npr/me/2017/01/20170113_me_sen_risch_says_trump_administration_will_be_much_stronger_on_russia.mp3?orgId=1&topicId=1014&d=331&p=3&story=509624434&t=progseg&e=509615548&seg=1&ft=nprml&f=509624434&siteplayer=true
INFO:podcastdownload:writing file to ./output/2017-01-16_MED/001_20170113_me_sen_risch_says_trump_administration_will_be_much_stronger_on_russia.mp3
DEBUG:podcastdownload:downloading https://ondemand.npr.org/anon.npr-mp3/npr/me/2017/01/20170113_me_trumps_promises_of_deportations_create_uncertainty_for_nj_family.mp3?orgId=1&topicId=1014&d=282&p=3&story=509547916&t=progseg&e=509615548&seg=2&ft=nprml&f=509547916&siteplayer=true
INFO:podcastdownload:writing file to ./output/2017-01-16_MED/002_20170113_me_trumps_promises_of_deportations_create_uncertainty_for_nj_family.mp3
DEBUG:podcastdownload:downloading https://ondemand.npr.org/anon.npr-mp3/npr/me/2017/0

True

In [19]:
# write the playlist; only segments whose `downloaded` flag is set are listed
mynpr.writeM3U()

INFO:podcastdownload:opening m3u playlist: 2017-01-13-MORNING-EDITION.m3u
DEBUG:podcastdownload:writing segments to: 2017-01-13-MORNING-EDITION.m3u


True

In [20]:
# append mynpr.outputPath to the download log kept in the output base path
mynpr.logDownload()


DEBUG:root:opening log file: ./output/download.log


True

In [21]:
# write title / track number / album tags; relies on `taggers` set up by loadModules()
mynpr.tagSegments()

INFO:podcastdownload:tagging segments
DEBUG:podcastdownload:tagging ./output/2017-01-13_MED/001_20170113_me_sen_risch_says_trump_administration_will_be_much_stronger_on_russia.mp3
DEBUG:podcastdownload:tagging ./output/2017-01-13_MED/002_20170113_me_trumps_promises_of_deportations_create_uncertainty_for_nj_family.mp3
DEBUG:podcastdownload:tagging ./output/2017-01-13_MED/003_20170113_me_with_his_choice_of_inauguration_prayer_leaders_trump_shows_his_values.mp3
DEBUG:podcastdownload:tagging ./output/2017-01-13_MED/004_20170113_me_hr1_return_-_alyssa.mp3
DEBUG:podcastdownload:tagging ./output/2017-01-13_MED/005_20170113_me_exiting_ambassador_to_china_shares_lessons_for_the_next_president.mp3
DEBUG:podcastdownload:tagging ./output/2017-01-13_MED/006_20170113_me_scientists_have_twisted_molecules_into_the_tightest_knot_ever.mp3
DEBUG:podcastdownload:tagging ./output/2017-01-13_MED/007_20170113_me_the_house_always_wins_until_he_came_along.mp3
DEBUG:podcastdownload:tagging ./output/2017-01-13_M

True

http://stackoverflow.com/questions/3609852/which-is-the-best-way-to-allow-configuration-options-be-overridden-at-the-comman
I just discovered you can do this with argparse.ArgumentParser.parse_known_args(). Start by using parse_known_args() to parse a configuration file from the command line, then read it with ConfigParser and set the defaults, and then parse the rest of the options with parse_args(). This will allow you to have a default value, override that with a configuration file and then override that with a command line option. E.g.:

Default with no user input:

$ ./argparse-partial.py
Option is "default"
Default from configuration file:

$ cat argparse-partial.config 
[Defaults]
option=Hello world!
$ ./argparse-partial.py -c argparse-partial.config 
Option is "Hello world!"
Default from configuration file, overridden by commandline:

$ ./argparse-partial.py -c argparse-partial.config --option override
Option is "override"
argparse-partial.py follows. It is slightly complicated to handle -h for help properly.

import argparse
import ConfigParser
import sys

def main(argv=None):
    # Do argv default this way, as doing it in the functional
    # declaration sets it at compile time.
    if argv is None:
        argv = sys.argv

    # Parse any conf_file specification
    # We make this parser with add_help=False so that
    # it doesn't parse -h and print help.
    conf_parser = argparse.ArgumentParser(
        description=__doc__, # printed with -h/--help
        # Don't mess with format of description
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Turn off help, so we print all options in response to -h
        add_help=False
        )
    conf_parser.add_argument("-c", "--conf_file",
                        help="Specify config file", metavar="FILE")
    args, remaining_argv = conf_parser.parse_known_args()

    defaults = { "option":"default" }

    if args.conf_file:
        config = ConfigParser.SafeConfigParser()
        config.read([args.conf_file])
        defaults.update(dict(config.items("Defaults")))

    # Parse rest of arguments
    # Don't suppress add_help here so it will handle -h
    parser = argparse.ArgumentParser(
        # Inherit options from config_parser
        parents=[conf_parser]
        )
    parser.set_defaults(**defaults)

See ./config.py for a working test configurator

In [109]:
def main(argv = None):
    '''
    Experiment: first-stage command line parsing for the "config file
    overridden by command line" recipe.

    Args:
        argv (list): full argument vector including the program name;
            defaults to sys.argv

    Returns:
        list: the arguments the first-stage parser did not recognise
    '''
    # argv defaults to None so sys.argv is looked up at call time
    if argv is None:
        argv = sys.argv

    defaults = {'fconfig':'foo.ini', 'configfile':'settings.ini'}

    # add_help=False so that -h is left for the second-stage parser
    cmdlineParser = argparse.ArgumentParser(description=__doc__,
                                            formatter_class=argparse.RawDescriptionHelpFormatter,
                                            add_help=False)
    cmdlineParser.add_argument('-f', '--fconfig', help='fake config file', metavar='FILE',
                                action='store')
    cmdlineParser.add_argument('-c', '--configfile', help='config file', metavar='FILE',
                              action='store')
    # parse the vector that was passed in (minus the program name) rather
    # than always falling back to sys.argv
    args, remaining_argv = cmdlineParser.parse_known_args(argv[1:])

    print(args.configfile)
    # second-stage parser: inherits the options above and carries the
    # defaults; it is configured here but not yet used to parse anything
    parser = argparse.ArgumentParser(parents=[cmdlineParser])
    parser.set_defaults(**defaults)
    #print parser.parse_args()
    print(args.fconfig)

    return(remaining_argv)

# run the experiment with no explicit argv; foo receives the list main() returns
foo = main()

None
/home/txoof/.local/share/jupyter/runtime/kernel-a762d3ef-2de3-438b-a96a-b6cb304459af.json


In [106]:
# display the value returned by main() in the previous cell
foo

[]

Create an argument parser and a config parser

Good recipes
    https://pymotw.com/2/ConfigParser/index.html#module-ConfigParser
    https://pymotw.com/2/argparse/index.html#module-argparse

In [85]:
# load settings.ini and print every section with its option names and values
# (ConfigParser is also imported in the first cell of the notebook)
import ConfigParser
parser = ConfigParser.SafeConfigParser()
parser.read('settings.ini')

for section in parser.sections():
    print 'Section:', section
    print '   options: ', parser.options(section)
    for name, value in parser.items(section):
            print '   %s = %s' % (name, value)
    print
    

# flatten the [Defaults] section into a plain dict; a later cell passes it
# to argparse's set_defaults()
configDict = {}
configDict.update(dict(parser.items("Defaults")))

Section: Defaults
   options:  ['option', 'loglevel', 'outputpath', 'f']
   option = 'hello world!'
   loglevel = 'DEBUG'
   outputpath = './output'
   f = foobar

Section: %All Things Considered
   options:  ['url', 'fetchmethod', 'programs', 'updatedays', 'updatetime', 'timezone']
   url = http://www.npr.org/programs/all-things-considered/
   fetchmethod = HTML
   programs = 2
   updatedays = [0, 1, 2, 3, 4, 5, 6]
   updatetime = 19:00
   timezone = EST



In [110]:
# seed an argument parser with the config-file values collected in configDict
# (argparse is also imported in the first cell of the notebook)
import argparse
argparser = argparse.ArgumentParser()
# -f is the only option declared here; the other configDict keys reach the
# namespace purely as defaults
argparser.add_argument('-f', action ="store")

argparser.set_defaults(**configDict)
# no argument list is given, so the process's own command line is parsed
argparser.parse_args()

Namespace(f='/home/txoof/.local/share/jupyter/runtime/kernel-a762d3ef-2de3-438b-a96a-b6cb304459af.json', loglevel="'DEBUG'", option="'hello world!'", outputpath="'./output'")

In [112]:
def main(argv = None):
    '''
    Experiment: merge config-file defaults with command line options and
    print the resulting namespace.

    Precedence, lowest to highest: values from the [Defaults] section of the
    config file, then options given on the command line. Config values are
    used verbatim, so quotes written in the ini file stay part of the value.

    Args:
        argv (list): full argument vector including the program name;
            defaults to sys.argv
    '''
    # argv defaults to None so sys.argv is looked up at call time
    if argv is None:
        argv = sys.argv
    cliArgs = argv[1:]

    # first stage: find out which config file to read; add_help=False so
    # that -h is handled by the merged parser below
    cmdlineParser = argparse.ArgumentParser(description=__doc__,
                                            formatter_class=argparse.RawDescriptionHelpFormatter,
                                            add_help=False)
    cmdlineParser.add_argument('-f', '--fconfig', help='fake config file', metavar='FILE',
                                action='store')
    cmdlineParser.add_argument('-c', '--configfile', help='config file', metavar='FILE',
                              action='store', default='settings.ini')
    cmdlineParser.add_argument('-d', '--dryrun', help='preform a dry run',
                              action='store_true', default=False)

    # parse the vector that was passed in rather than always using sys.argv
    args, _unparsed = cmdlineParser.parse_known_args(cliArgs)

    # parse the configuration file; a missing file or a missing [Defaults]
    # section simply contributes no defaults
    defaults = {}
    configParser = ConfigParser.SafeConfigParser()
    if not configParser.read(args.configfile):
        logging.warning('could not read config file: %s', args.configfile)
    if configParser.has_section('Defaults'):
        defaults.update(dict(configParser.items('Defaults')))

    # merge the command line with the config file options: the second-stage
    # parser inherits the options above and re-parses the same arguments on
    # top of the config-file defaults
    parser = argparse.ArgumentParser(parents=[cmdlineParser])
    parser.set_defaults(**defaults)

    print(parser.parse_args(cliArgs))


# run the merge experiment with no explicit argv
main()

Namespace(configfile='settings.ini', dryrun=False, f='foobar', fconfig='/home/txoof/.local/share/jupyter/runtime/kernel-a762d3ef-2de3-438b-a96a-b6cb304459af.json', loglevel="'DEBUG'", option="'hello world!'", outputpath="'./output'")
