In [234]:
#!/usr/bin/env python
# Copyright 2016 Aaron ciuffo

version = '''NPR Podcast Downloader V5.0

by Aaron Ciuffo (txoof.com)
released without warranty under GPLV3:
http://www.gnu.org/licenses/gpl-3.0.html
Please don't sue me.
'''

programName = 'podcastdownload'

# Imports
from datetime import datetime # for time stuff
import pytz
import logging # logging library
from urllib2 import urlopen # standard library for interfacing with web resources
from urllib2 import URLError
import re # regular expressions
import json # handle JSON objects
import os # Operating System interface
import sys # internal operations including a list of imported modules
import fnmatch # used by cleanup method in Episode
import glob # used by m3u method - consider replacing with some other library
import shutil # used by cleanup method
import argparse # parse command line arguments
import ConfigParser # parse config files



# TO DO
 * Adapt NPREpisode object to use new class attributes for output paths
 * complete the cleanup method
 * remove any 'stale' episodes
 * add a check to see if a program is already downloaded (maybe look for an m3u file, or consult the download log)
 
 

By default the script should not repeat a download unless explicitly asked to.

In [12]:
def loadModules():
    '''
    Load the non standard python modules used by the downloader

    Publishes the following module-level (global) names:
        requests: the requests module (required)
        taggers (dict): lowercase file extension -> tagger class; only
            contains entries for the mutagen taggers that imported cleanly
        MP3: mutagen.mp3.EasyMP3 (only defined when the import succeeds)
        MP4: mutagen.mp4.MP4 (only defined when the import succeeds)

    Returns:
        bool: True once loading has finished

    Raises:
        SystemExit: exit status 2 when the requests module cannot be imported
    '''
    import logging
    import sys

    global requests, taggers, MP3, MP4

    logging.basicConfig()
    logging.debug('loading module: requests')
    try:
        import requests
    except Exception as e:
        logging.critical('Fatal Error\nFailed to load module: requests\n%s', e)
        logging.critical('Please install requests module: http://docs.python-requests.org/')
        # sys.exit always raises SystemExit; the bare exit() helper is supplied
        # by site/IPython and is not guaranteed to stop execution
        sys.exit(2)

    # create a global dictionary of all the taggers available
    taggers = {}

    logging.debug('loading module: mutagen.mp3')
    try:
        from mutagen.mp3 import EasyMP3 as MP3
        # register only after a successful import; a failed import must not
        # turn into a NameError on the unbound name
        taggers['mp3'] = MP3
    except Exception as e:
        logging.critical('Failed to load module: mutagen.mp3\n%s', e)
        logging.critical('mp3 tagging may not be available')

    logging.debug('loading module: mutagen.mp4')
    try:
        from mutagen.mp4 import MP4
        taggers['mp4'] = MP4
    except Exception as e:
        logging.critical('Failed to load module: mutagen.mp4\n%s', e)
        logging.critical('mp4 tagging may not be available')

    return True

In [30]:
def div(num = 10, char = '*'):
    '''
    Build a divider by repeating a string
    Args:
        num (int): how many copies of char to join together; when this is
            not an int a single copy is returned
        char (string): text to repeat (converted with str() first)
    Returns:
        string: str(char) repeated num times
    '''
    text = str(char)
    # anything that is not an int falls back to exactly one copy
    copies = num if isinstance(num, int) else 1
    return str(text * copies)

In [4]:
class Episode(object):
    '''Podcast episode object

    Holds the naming/path information and the list of Segment() objects for
    one episode and provides the operations performed on it: downloading the
    segments, writing an m3u playlist, tagging the audio files, recording the
    download in a log file and removing stale episode directories.
    '''

    def __init__(self, name = 'No Name', programURL = 'undef', showDate = None, outputBasePath = './',
                 m3u = 'playlist.m3u', downloadLog = 'download.log'):
        '''
        Args:
            name (str): name of episode/podcast
            programURL (str): Index URL containing list of files to download
            showDate (str): date of episode
            outputBasePath (str): base path to use for output of files (default is ./)
            m3u (str): m3u playlist filename
            downloadLog (str): download log filename

        Attributes:
            name (str): name of episode/podcast
            programURL (str): Index URL containing list of files to download
            segments (list): Segment() objects to be downloaded
            showDate (str): date of episode
            outputBasePath (str): base path to use for output of files (default is ./)
            outputShowPath (str): path within outputBasePath - slugified version of name
            outputPath (str): path within outputShowPath - set to outputShowPath by default
            m3u (str): m3u playlist filename
            downloadLog (str): download log filename
        '''
        self.name = name # str
        self.programURL = programURL # str
        self.segments = [] # list
        self.showDate = showDate # str
        self.outputBasePath = self._slash(outputBasePath) # str
        self.outputShowPath = self.outputBasePath + self._slash(self._slugify(self.name))
        self.outputPath = self.outputShowPath
        self.m3u = m3u
        self.downloadLog = downloadLog

    def attributes(self, display = None):
        '''
        Show a selection of the attributes of this episode
        Args:
            display (list): names of the attributes to include; any value
                that is not a list selects the default set of naming and
                path attributes
        Returns:
            dict: attribute name -> current value for every requested
                attribute that exists on this object
        '''
        if not isinstance(display, list):
            display = ['name', 'programURL', 'showDate', 'outputBasePath', 'outputShowPath', 'outputPath',
                       'm3u', 'downloadLog']
        return {key: val for key, val in self.__dict__.items() if key in display}

    def _slugify(self, value):
        """
        Normalizes string for use in a file name: decomposes accented
        characters and drops whatever cannot be expressed in ASCII, removes
        everything that is not a word character, whitespace or a hyphen, and
        collapses runs of whitespace/hyphens into a single hyphen.
        Case is preserved.

        Adapted from Django's "django/template/defaultfilters.py".
        Args:
            value (str): string to be normalized for use with a filename

        Returns:
            unicode: sluggified string
        """
        import unicodedata

        _slugify_strip_re = re.compile(r'[^\w\s-]')
        _slugify_hyphenate_re = re.compile(r'[-\s]+')

        # `unicode` on Python 2, `str` on Python 3
        text_type = type(u'')
        if not isinstance(value, text_type):
            value = text_type(value)
        # round-trip through ASCII to discard the combining marks left by NFKD
        # and any other non-ASCII character
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = _slugify_strip_re.sub('', value).strip()
        return _slugify_hyphenate_re.sub('-', value)

    def _slash(self, value):
        '''
        Ensures path has a trailing slash

        Args:
            value (str): string to check and modify

        Returns:
            value (str): string with trailing slash

        '''
        # endswith is used instead of a regular expression: `$` also matches
        # just before a trailing newline, which would leave such a path
        # without its slash
        if value.endswith('/'):
            return value
        logging.debug('adding trailing slash to path: %s', value)
        return value + '/'

    def setOutputPath(self, outputShowPath = None, outputEpisodePath = None):
        '''
        Method to update the output paths
        Args:
            outputShowPath (str): path within the outputBasePath
            outputEpisodePath (str): path within outputShowPath; when omitted
                the output path is reset to the show path
        Returns:
            outputPath (str): the path segments will be written to
        '''
        if outputShowPath:
            self.outputShowPath = self._slash(self.outputBasePath) + self._slash(outputShowPath)

        if outputEpisodePath:
            self.outputPath = self._slash(self.outputShowPath) + self._slash(outputEpisodePath)
        else:
            self.outputPath = self.outputShowPath

        return self.outputPath

    def setM3U(self, name = 'playlist'):
        '''
        Update the m3u file name
        Args:
            name (str): name for the playlist; it is slugified and the
                '.m3u' extension is appended
        Returns:
            bool: True
        '''
        self.m3u = self._slugify(name) + '.m3u'
        return True

    def writeM3U(self, filename = False):
        '''
        Write M3U playlist for the episode into the episode output path;
        only segments flagged as downloaded are listed
        Args:
            filename (str): optional new playlist name; it is passed through
                setM3U() (slugified, '.m3u' appended) before writing
        Returns:
            bool: True when the playlist was written, False on any file error
        '''
        if filename:
            self.setM3U(filename)
        logging.info('opening m3u playlist: %s', self.m3u)

        try:
            m3ufile = open(self._slash(self.outputPath) + self.m3u, 'w')
        except Exception as e:
            logging.error('could not open m3u file: %s\n%s', self.m3u, e)
            return False

        logging.debug('writing segments to: %s', self.m3u)
        try:
            # the with block closes the file on the error path as well
            with m3ufile:
                for segment in self.segments:
                    # if it was successfully downloaded write it to the m3u file
                    if segment.downloaded:
                        logging.debug('writing segment to m3u file: %s', segment.filename)
                        m3ufile.write(segment.filename + '\n')
        except Exception as e:
            logging.error('could not write to: %s\n%s', self.m3u, e)
            logging.error('halting m3u writing')
            return False

        return True

    def download(self, dryrun = False, timeout = 5):
        '''
        Download all segments in self.segments into self.outputPath

        The output directory is created when missing (together with a lock
        file that cleanUp() later uses to recognise episode directories).
        Nothing is downloaded when an m3u playlist already exists in the
        output path. The attempt is recorded through logDownload().

        Args:
            dryrun (bool): When true do all other steps, but do not download and return: False
            timeout (real): time in seconds to wait for a download to complete before timing out

        Returns:
            bool: True for successful download of all segments; False for a
                dry run, a previously downloaded episode, an output
                directory that cannot be created or any failed segment
        '''
        # a dry run never counts as a success, even without any segments
        success = not dryrun
        lockfile = self.outputPath + '.' + programName + '.lock'
        logging.info('downloding program: %s', self.name)

        # check for output path
        logging.debug('checking for output directory: %s', self.outputPath)
        if not os.path.isdir(self.outputPath):
            logging.debug('output directory (%s) not found', self.outputPath)
            logging.debug('attempiting to create output directory')
            try:
                os.makedirs(self.outputPath)
            except Exception as e:
                logging.error('could not create outputpath for this episdoe at: %s\n%s', self.outputPath, e)
                logging.error('download failed')
                return False

            # make a 'lock file' in the folder to help with cleanup later
            logging.debug('writing lockfile: %s', lockfile)
            try:
                with open(lockfile, 'a'):
                    os.utime(lockfile, None)
            except Exception as e:
                logging.error('could not create lockfile: %s', lockfile)
                logging.error('file error: %s', e)

        # check for existing m3u files; stop downloading if it exists
        if len(glob.glob(self.outputPath + '/*.m3u')) > 0:
            logging.info('episode previously downloaded; skipping')
            return False

        logging.debug('dryrun = %s', dryrun)
        if dryrun:
            logging.debug('downloads will be simulated')

        for segment in self.segments:
            # update the path for the current segment
            filePath = self.outputPath + segment.filename
            logging.debug('downloading %s', segment.audioURL)

            if dryrun:
                logging.info('writing file to %s', filePath)
                # record succsessful downloading of all segments when doing a dry run
                segment.downloaded = True
                continue

            # any failure (URLError, a timeout while reading, a malformed
            # URL...) only costs this segment, not the rest of the episode
            try:
                response = urlopen(segment.audioURL, timeout = timeout)
                try:
                    audioFile = response.read()
                finally:
                    response.close()
            except Exception as e:
                logging.warning('could not download segment number: %s', segment.number)
                logging.warning('error: %s; timeout: %s', e, timeout)
                success = False
                continue

            logging.info('writing file to %s', filePath)
            try:
                with open(filePath, 'wb') as code:
                    code.write(audioFile)
                # record if the writing was successful
                segment.downloaded = True
            except Exception as e:
                logging.warning('could not write segment number %s to %s\nerrors follow', segment.number, filePath)
                logging.warning(e)
                success = False

        self.logDownload()

        return success

    def logDownload(self):
        '''
        Append the output path of this episode to the download log kept in
        the output base path
        Args:

        Returns:
            bool: True when the entry was written, False on any file error
        '''
        logFile = self.outputBasePath + self.downloadLog

        logging.debug('opening log file: %s', logFile)
        try:
            f = open(logFile, 'a')
        except Exception as e:
            logging.error('could not open log file: %s\n%s', logFile, e)
            return False

        try:
            # the with block closes the file on the error path as well
            with f:
                f.write(self.outputPath + '\n')
        except Exception as e:
            logging.error('could not write to log file: %s\n%s', logFile, e)
            return False

        return True

    def addSegment(self, segment):
        '''
        Add a downloadable segment to the segment list
        Args:
            segment (Segment): Segment() object containing information
        Returns:
            bool: True
        '''
        self.segments.append(segment)
        return True

    def tagSegments(self):
        '''
        Tag the audio file of every segment with its title, track number and
        program name

        The tagger is looked up by lowercase file extension in the
        module-level `taggers` dictionary (created by loadModules()).
        Segments with no extension, an unknown extension or a file the
        tagger cannot handle are logged and skipped.
        Args:

        Returns:
            bool: True
        '''
        logging.info('tagging segments')
        for segment in self.segments:
            filename = self.outputPath + segment.filename
            extension = re.search(r'\.(\w+$)', filename)
            # taggers is keyed by lowercase extension
            filetype = extension.group(1).lower() if extension else None

            if filetype not in taggers:
                logging.info('could not tag, unknown filetype: %s', filename)
                continue

            logging.debug('tagging %s', filename)
            try:
                audio = taggers[filetype](filename)
                audio['title'] = segment.title
                audio['tracknumber'] = str(segment.number)
                audio['album'] = segment.programName
                audio.save()
            except Exception as e:
                logging.error('could not write tags for: %s\n%s', filename, e)

        return True

    def cleanUp(self, keep = 2, dryrun = False, lockfile = '*.lock'):
        '''
        Remove stale episodes, keeping at maximum keep episodes

        Every directory below outputShowPath that contains a lock file is a
        candidate. Candidates are sorted by path and the first ones are
        removed until only `keep` remain.

        Args:
            keep (int): maximum number of episodes to keep
            dryrun (bool): when true, do not actually delete anything
            lockfile (str): lockfile pattern glob to use when searching for lockfiles; default:*.lock
        Returns:
            removed (list): removed paths; empty for a dry run and when keep
                is not a non-negative integer
        '''

        logging.info('cleaning up stale shows for %s', self.name)
        # refuse to delete anything when the limit makes no sense
        if not isinstance(keep, int):
            logging.error('%s is not an integer: keep', keep)
            return []
        if keep < 0:
            logging.error('keep must not be negative: %s', keep)
            return []
        logging.info('keeping a maximum of %s shows', keep)

        # candididate directories that contain lockfiles for deletion
        matchdir = {}
        logging.debug('searching path: %s', self.outputShowPath)
        for root, dirnames, filenames in os.walk(self.outputShowPath):
            logging.debug('%s', root)
            for filename in fnmatch.filter(filenames, lockfile):
                logging.debug('      %s', filename)
                matchdir[root] = filename

        logging.debug('previously downloaded episodes found: %s', len(matchdir))

        # everything except the last `keep` candidates is stale
        candidates = sorted(matchdir)
        delete = candidates[:max(len(candidates) - keep, 0)]
        for path in delete:
            logging.debug('flagged for deletion: %s', path)

        # paths successfully deleted
        removed = []
        for path in delete:
            lockpath = os.path.join(path, matchdir[path])
            logging.debug('attempting to clean episode files in: %s', path)
            # double check that a *.lock file exists before attempting a delete
            if not os.path.isfile(lockpath):
                logging.warning('discovered missing lock file when attempting cleanup: %s', lockpath)
                logging.warning('manual deletion required: %s', path)
                logging.warning('skipping path: %s\n', path)
                continue

            logging.debug('found lock file in path: %s', path)
            if dryrun:
                logging.debug('dryrun: simulating deletion (nothing will be removed)')
                continue

            logging.debug('deleting path: %s\n', path)
            try:
                shutil.rmtree(path)
                # record those paths removed
                removed.append(path)
            except OSError as e:
                logging.error('could not delete path: %s', e)

        return removed

# Ad-hoc smoke test of the Episode workflow, run directly in this cell.
# NOTE(review): NPREpisode is defined in a LATER cell (In [5]), so on a fresh
# kernel this cell raises NameError unless that cell has been executed first.
# NOTE(review): these statements run every time the cell executes; they fetch
# the program page over the network and create ./output on disk.
#myNPR = NPREpisode(name = 'Weekend Edition Saturday', programURL = 'http://www.npr.org/programs/weekend-edition-saturday/', outputBasePath = './output')
myNPR = NPREpisode(name = 'Morning Edition', programURL = 'http://www.npr.org/programs/morning-edition/', outputBasePath = './output')
# scrape the program page for the show date and the segment list
myNPR.getepisode_HTML()
# dry run: the output directory is prepared and the run is logged, but no audio is fetched
myNPR.download(dryrun = True, timeout = .0001)
#myNPR.writeM3U()
# dry run: stale episode directories are only reported, so foo is an empty list
foo = myNPR.cleanUp(3, dryrun = True)
#myNPR.attributes(

In [5]:
class NPREpisode(Episode, object):
    '''NPR program episode object
        Args:
            name (str): name of episode/podcast
            programURL (str): Index URL containing list of files to download
            outputBasePath (str): base path to use for output of files (default is ./)
            m3u (str): m3u playlist filename
            downloadLog (str): download log filename

        Attributes (in addition to those of Episode):
            jsonData: decoded JSON episode listing scraped from the program
                page; None until getepisode_HTML() has succeeded
    '''

    def __init__(self, name = 'unknown', programURL = None, outputBasePath = './', m3u ='playlist.m3u', downloadLog = 'download.log'):
        super(NPREpisode, self).__init__(name = name, programURL = programURL,
                                         outputBasePath = outputBasePath, m3u = m3u, downloadLog = downloadLog)
        self.jsonData = None

    def recentEpisodes(self):
        '''Identify the most recent episodes
        Not yet implemented
        '''
        pass

    def getepisode_API(self):
        '''
        Use the NPR API to get a list of episodes
        Not yet implemented
        '''
        pass

    def getepisode_HTML(self):
        '''
        Scrape the HTML for JSON containing the date segment and title information
        Attributes set here:
            self.jsonData (json obj) - JSON listing of episodes from NPR
            self.showDate (str) - YYYY-MM-DD formatted string
            self.name (str) - human readable show name (upper case, taken
                from the first audioData entry when it names a program)
            self.outputPath (str) - sub directory named after the show date
                and an abbreviation of the show name
            self.m3u (str) - playlist name built from show date and name
            self.segments (:obj: Segment) - episode segments are populated and added

        Returns:
            bool: True if episode information is scraped from the HTML, False otherwise
        '''

        logging.info('fetching episode info via HTML method')
        logging.debug('source: %s', self.programURL)

        # search terms hardcoded here
        # JSON data embedded in the program HTML
        search_PlayAll = r"<b.*data-play-all='({.*})'><\/b>"
        # captures the leading run of whitespace, word characters, '|', '.',
        # "'" and '-' of the last URL component, i.e. the file name without
        # any query string
        search_FileName = r"(^[\s|\w|\.|'|-]*)\[?|$]"
        # show date in YYYY-MM-DD form
        search_showDate = r'datetime="(\d{4}-\d{2}-\d{2})'

        try: # fetch the full show HTML
            programHTML = urlopen(self.programURL).read()
        except Exception as e:
            logging.warning('could not fetch episode information from %s' % self.programURL)
            logging.error(e)
            return False
        logging.debug('HTML retrieved successfully')

        # find the show date and record it; the output path and the playlist
        # name are derived from it, so give up when it cannot be found
        dateMatch = re.search(search_showDate, programHTML)
        if dateMatch is None:
            logging.warning('no valid showDate found')
            return False
        self.showDate = dateMatch.group(1)
        logging.info('show date: %s', self.showDate)

        try: # find the JSON program data
            self.jsonData = json.loads(re.search(search_PlayAll, programHTML).group(1))
        except Exception as e:
            logging.error('no valid JSON episode listing found in HTML from %s', self.programURL)
            logging.error(e)
            return False

        # check that some JSON data was found - a single segment is enough
        audioData = self.jsonData.get('audioData') if isinstance(self.jsonData, dict) else None
        if not isinstance(audioData, list) or len(audioData) < 1:
            logging.warning('no valid audioData found in JSON object for this program')
            return False

        firstEntry = audioData[0]
        program = firstEntry.get('program') if isinstance(firstEntry, dict) else None
        if program:
            logging.debug('JSON program information found for %s', program.upper())
            self.name = program.upper() # set the episode name
            logging.debug('setting name to: %s', self.name)
        logging.debug('segments found: %s', len(audioData))

        # grab the first character of each word in the program name; grab the
        # first two characters of the last word
        words = self.name.split(' ')
        short_name = '_' + ''.join(word[:1] for word in words[:-1]) + words[-1][:2]

        # create a sub directory within the output path
        self.setOutputPath(outputEpisodePath = self.showDate + short_name)
        logging.debug('output path set to: %s', self.outputPath)

        #set m3u name
        self.setM3U(self.showDate + '-' + self.name)
        logging.debug('m3u filename set to: %s', self.m3u)

        # walk the JSON object and find all the audioData information
        for index, entry in enumerate(audioData):
            number = index + 1 # set the human readable segment number
            try:
                audioURL = entry['audioUrl']
                title = entry['title']
                lastComponent = audioURL.split('/')[-1]
            except Exception as e:
                # never fall through with values left over from the previous segment
                logging.warning('failed to parse JSON data: %s', e)
                continue
            logging.debug('%s - %s', number, title)

            # check the bare file name before the number prefix is added;
            # afterwards it can never be empty
            basename = re.search(search_FileName, lastComponent).group(1)
            if len(basename) < 1:
                logging.warning('no filename found; dropping segment')
                continue

            # prepend the segment number
            filename = str(number).zfill(3) + '_' + basename

            self.addSegment(Segment(audioURL = audioURL, filename = filename,
                                    number = number, programName = self.name,
                                    title = title))

        return True
            

In [6]:
class Segment():
    '''A single downloadable piece of a podcast episode'''

    def __init__(self, audioURL, filename, number, programName, title = None):
        '''
        Args:
            audioURL (str): URL to specific downloadable content
            filename (str): output filename
            number (int): ordinal number of segment
            programName (str): program Name
            title (str): human readable segment title

        Attributes:
            the five arguments above, stored under the same names
            downloaded (bool): starts out False; set to True elsewhere once
                the segment has been written to disk
        '''
        # nothing has been fetched yet when a segment is first described
        self.downloaded = False
        # descriptive information used for tagging
        self.programName, self.title, self.number = programName, title, number
        # where the audio comes from and where it is written to
        self.audioURL, self.filename = audioURL, filename

In [7]:
class showConfig(object):
    '''Configuration object for a downloadable show'''

    def __init__(self, optionsDict = None):
        '''
        Args:
            optionsDict (dict): dictionary of options to be used in configuration
                showname (str): human readable string
                fetchmethod (str): method for downloading show (NPR_HTML or NPR_API)
                programs (int): number of programs to keep
                updatedays (str): comma separated integers 1-7 naming the days of the week to update
                updatetime (str): time in 24H HH:MM format after which an update should be attempted
                timezone (str): timezone in which to preform time calculations
                url (str): url to NPR program page
        Attributes:
            options (dict): dictionary of options
            showName (str): human readable name of show
            fetchMethod (str): method for downloading show (NPR_HTML or NPR_API)
            programs (int): number of programs to keep
            updateDays (list): integers 1-7 representing days of the week to update
            updateTime: time after which an update should be attempted; an
                empty string until verifyConfig() stores a datetime.time
            timezone (str): timezone in which to preform time calculations
            url (str): url to NPR program page

        '''
        # a fresh dictionary per instance; a `{}` default would be shared
        # between every showConfig created without options
        self.options = optionsDict if optionsDict is not None else {}
        self.showName = 'No Name'
        self.fetchMethod = 'NPR_HTML'
        self.programs = 10
        self.updateDays = []
        self.updateTime = ''
        self.timezone = 'EST'
        self.url = None

    def verifyConfig(self):
        '''

        Validates and sets configuration paramaters for a downloadable show;
        self.options itself is only read, never modified:

        Attributes:
            showName (str): human readable name of show
            fetchMethod (str): method for downloading show (NPR_HTML or NPR_API)
            programs (int): number of programs to keep
            updateDays (list): sorted integers 1-7 representing days of the week to update
            updateTime (datetime.time): time after which an update should be attempted
            timezone (str): timezone name as spelled in the pytz database; UTC when
                missing or unknown
            url (str): url to NPR program page

        Args:
            None

        Returns:
            bool: True - configuration is OK or has been made OK
                  False - no usable http(s) url was supplied

        '''

        logging.debug('verifying configuration')

        if 'showname' in self.options:
            self.showName = self.options['showname']
            logging.debug('show name set to: %s', self.showName)
        else:
            self.showName = 'Unknown Show'
            logging.debug('no show name found; set to: %s', self.showName)

        if 'fetchmethod' in self.options:
            self.fetchMethod = self.options['fetchmethod']
            logging.debug('fetchmethod set to: %s', self.fetchMethod)
        else:
            logging.error('no fetchmethod set; setting to: %s', self.fetchMethod)

        if 'programs' in self.options:
            try:
                self.programs = int(self.options['programs'])
            except (TypeError, ValueError) as e:
                logging.error('programs option not an integer: %s', e)
                logging.error('programs set to: %s', self.programs)
        else:
            logging.error('no programs setting found in configuration file; set to: %s', self.programs)

        defaultUpdateDays = [1, 2, 3, 4, 5, 6, 7]
        if 'updatedays' in self.options:
            # remove anything that is not a numeral or a comma
            cleaned = re.sub(r'[^,0-9]+', '', self.options['updatedays'])
            # leading, trailing and repeated commas only produce empty tokens
            tokens = [token for token in cleaned.split(',') if token]
            if tokens:
                days = [int(token) for token in tokens]
            else:
                logging.warning('bad or missing update date format: %s', self.options['updatedays'])
                logging.warning('using sun through sat')
                days = list(defaultUpdateDays)

            validDays = []
            for day in days:
                # check for bad values that are less than 1 or greater than 7
                if day > 7 or day < 1:
                    logging.warning('found invalid day in configuration file: %s', day)
                    logging.warning('removing invalid day: %s', day)
                else:
                    validDays.append(day)
            self.updateDays = sorted(validDays)
        else:
            # supply a list if none is supplied
            logging.warning('no update days were supplied using sun through sat')
            self.updateDays = list(defaultUpdateDays)

        # do some validation of valid timezones
        if 'timezone' in self.options:
            requested = self.options['timezone'].strip()
            # pytz names are case sensitive ('America/New_York'); match without
            # regard to case and store the database spelling
            knownZones = dict((zone.upper(), zone) for zone in pytz.all_timezones)
            if requested in pytz.all_timezones:
                self.timezone = requested
            elif requested.upper() in knownZones:
                self.timezone = knownZones[requested.upper()]
            else:
                logging.error('specified timezone not found in database: %s', self.options['timezone'])
                logging.error('setting timezone to: UTC')
                self.timezone = 'UTC'
        else:
            logging.error('no timezone found; setting timezone to: UTC')
            self.timezone = 'UTC'

        # do some validation of valid times
        # time format
        timeFMT = '%H:%M'
        defaultTime = '23:59'
        if 'updatetime' in self.options:
            # sanitize the time string: keep numerals and colons only
            try:
                self.updateTime = datetime.strptime(re.sub(r'[^0-9:]+', '', self.options['updatetime']),
                                                    timeFMT).time()
            except ValueError as e:
                logging.error('bad updatetime time format: %s', self.options['updatetime'])
                logging.error('setting updatetime to: %s', defaultTime)
                self.updateTime = datetime.strptime(defaultTime, timeFMT).time()
        else:
            self.updateTime = datetime.strptime(defaultTime, timeFMT).time()

        if 'url' in self.options:
            # accept both http and https program pages
            if re.match(r'^https?://', self.options['url'].lower()):
                self.url = self.options['url']
            else:
                logging.error('no vlaid URL found for %s: %s', self.showName, self.options['url'])
                return False
        else:
            logging.error('no valid URL found for %s', self.showName)
            return False
        return True
                    

In [49]:
# for testing only: keep a handle on the root logger so its handlers can be
# inspected between runs of main()
log = logging.getLogger()
# remove any extra handlers that were added through the handler calls in main();
# left disabled because main() clears the root logger's handlers itself on entry
#for each in range(0, len(log.handlers)):
#    log.removeHandler(log.handlers[0])
##### REMOVE THIS #####

In [215]:
def main(argv=None):
    '''Run one pass of the podcast downloader.

    Sets up logging, merges the settings found in the configuration file with
    the command line options (the command line wins), collects every show
    section from the configuration file, and downloads, tags and cleans up the
    episode of each show that is due according to its update days and time.

    Args:
        argv: list of command line arguments to parse; None (the default)
            makes argparse read sys.argv[1:]

    Returns:
        argparse.Namespace holding the merged configuration file and command
        line options

    Exits with status 1 when no valid show section is found in the
    configuration file.
    '''

    # useful for testing with jupyter/ipython when running this function over and over
    # removes handlers that should only be added once; close them as well so the
    # log file opened by a previous call is not leaked
    rootLogger = logging.getLogger()
    for handler in list(rootLogger.handlers):
        rootLogger.removeHandler(handler)
        handler.close()

    # set the log format:
    # [  DEBUG 2017-02-12 19:14] loading module: requests
    logFormatter = logging.Formatter('[%(levelname)7s %(asctime)s] %(message)s', '%Y-%m-%d %H:%M')

    # TODO: move this below the configuration stuff so the loglevel/verbose
    # options can be applied; until then everything is logged at DEBUG
    rootLogger.setLevel(logging.DEBUG)

    # add a file handler to the root logger
    fileHandler = logging.FileHandler(programName+'.log')
    fileHandler.setFormatter(logFormatter)
    rootLogger.addHandler(fileHandler)

    # add a console handler to the root logger
    consoleHandler = logging.StreamHandler(sys.stdout)
    consoleHandler.setFormatter(logFormatter)
    rootLogger.addHandler(consoleHandler)

    logging.info('%s started log %s', div(20, '#'), div(20, '#'))

    #### init variables
    # list of valid show configurations found in configuration file
    showsConfig = []

    # list of (show configuration, episode) pairs to download; keeping the pair
    # together guarantees the clean up step uses the settings of the right show
    episodes = []

    # settings read from the 'Default' section of the configuration file
    defaults = {}

    # load non-standard python libraries
    loadModules()

    ##### Default values for basic command line configuration settings
    # default configuration file
    cfgFile = 'settings.ini'

    # disable -h for help so the second parser can deal with this
    # http://stackoverflow.com/questions/3609852/which-is-the-best-way-to-allow-configuration-options-be-overridden-at-the-comman
    cmdlineParser = argparse.ArgumentParser(description = __doc__,
                                            formatter_class = argparse.RawDescriptionHelpFormatter,
                                            add_help = False)
    # set the configuration file
    cmdlineParser.add_argument('-c', '--configfile', help='configuration file', metavar='FILE',
                               action='store', default = cfgFile)
    # determine if this is a dry run or not
    cmdlineParser.add_argument('-d', '--dryrun', help='preform a dry-run with no downloads',
                               action='store_true', default=False)
    cmdlineParser.add_argument('-l', '--loglevel', metavar = 'LOGLEVEL',
                               help = 'explicitly set log level; default: warn')
    cmdlineParser.add_argument('-o', '--outputpath', action = 'store', metavar = 'PATH',
                               help = 'path to output downloaded files')
    # convert to float so the value compares and behaves like the configuration file value
    cmdlineParser.add_argument('-t', '--timeout', action = 'store', type = float)
    cmdlineParser.add_argument('-v', '--verbose', action = 'count',
                               help = 'verbose mode; add more -v to increase verbosity')

    # first pass: only the configuration file name is needed at this point;
    # anything this parser does not know (e.g. the jupyter -f option) is ignored here
    args, unknownArgs = cmdlineParser.parse_known_args(argv)

    configParser = ConfigParser.SafeConfigParser()
    # open and read configuration file set at the command line;
    # read() skips files it cannot open and returns the list of files it parsed
    if not configParser.read(args.configfile):
        logging.error('unable to read configuration file: %s', args.configfile)

    # required options in 'Default' section
    # dict {'option name' : [configParser.get(float, boolean), 'default value']}
    required = {'outputpath' : [configParser.get, './DownloadedShows']}

    optional = {'dryrun' : [configParser.getboolean, False],
                'timeout' : [configParser.getfloat, 5],
                'loglevel': [configParser.get, 'WARNING'],
                'useragent': [configParser.get, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2; rv:45.0) Gecko/20100101 Firefox/45.0']}

    # look for each required option and set to default if not found
    for key in required:
        getter, fallback = required[key]
        try:
            defaults[key] = getter('Default', key)
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError) as e:
            logging.error('error in configuraiton file: %s', e)
            logging.error('using default value: %s = %s', key, fallback)
            defaults[key] = fallback

    # search for each optional option and set to default if not found
    for key in optional:
        getter, fallback = optional[key]
        try:
            defaults[key] = getter('Default', key)
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError) as e:
            logging.debug('%s option not found in configuration file', key)
            logging.debug('setting to default: %s', fallback)
            defaults[key] = fallback
        except ValueError as e:
            # getboolean/getfloat reject values they cannot convert
            logging.error('bad value for %s in configuration file: %s', key, e)
            logging.error('using default value: %s = %s', key, fallback)
            defaults[key] = fallback

    # merge the command line with the config file options: the configuration
    # file supplies the defaults and the real command line is parsed on top of them
    parser = argparse.ArgumentParser(parents=[cmdlineParser])
    parser.set_defaults(**defaults)
    parserArgs, unknownArgs = parser.parse_known_args(argv)

    if unknownArgs:
        logging.warning('invalid command line option(s):')
        for arg in unknownArgs:
            logging.warning('%s', arg)

    # verify parserArgs below:
    if parserArgs.timeout > 120:
        logging.warning('timeout values under 120s are reccomended: %s', parserArgs.timeout)

    # add a trailing '/' to the output path
    parserArgs.outputpath = str(parserArgs.outputpath)
    if not parserArgs.outputpath.endswith('/'):
        parserArgs.outputpath = parserArgs.outputpath + '/'

    # search for all the show sections in the configuration file
    logging.debug('%s searching config file for shows', div(10))
    for section in configParser.sections():
        # find all show descriptors: section names containing '%' but no '#'
        if '%' in section and '#' not in section:
            logging.debug('%s found show: %s', div(5), section)
            show = showConfig(dict(configParser.items(section)))
            if show.verifyConfig():
                # if no name is set, use the show descriptor
                if 'showname' not in show.options:
                    show.showName = str(section).replace('%', '')
                    logging.warning('No show name found in configuration file\nusing section name: %s', show.showName)
                showsConfig.append(show)
            else:
                logging.error('unable to verify configuration for %s; discarding', show.showName)

    # exit if no shows were found
    if len(showsConfig) == 0:
        logging.critical('no shows found in configuration file: %s', args.configfile)
        sys.exit(1)

    # parse each show section in the configuration file, validate and add it to the list for downloading
    logging.debug('%s checking for shows to download', div(10))
    for each in showsConfig:
        # take a single reading of the clock in the show's timezone so the time
        # and the date always belong to the same instant
        now = datetime.now(pytz.timezone(each.timezone))
        # time object (H:M:S)
        nowTime = now.time()
        # date object
        nowDate = now.date()

        # create an NPREpisode object and populate
        logging.debug('%s parsing configuration for [%s]', div(5), each.showName)
        myEpisode = NPREpisode(name = each.showName, outputBasePath = parserArgs.outputpath)
        myEpisode.programURL = each.url

        # check the day of the week; if it's not in range, skip this show
        if nowDate.isoweekday() not in each.updateDays:
            logging.debug('update days in set timezone for this show (%s) does not include today: %s', each.updateDays, nowDate.isoweekday())
            logging.info('skipping episode for show: %s (wrong day)', each.showName)
            continue

        # check the time; if it's too early, skip the show
        if nowTime > each.updateTime:
            logging.debug('the current time in set timezone for this show (%s) is currently later than set update time: %s', each.timezone, each.updateTime)
            logging.info('fetching episode information for show: %s', each.showName)
            # add the episode to the download list if the html fetch was successful
            if myEpisode.getepisode_HTML():
                episodes.append((each, myEpisode))
        else:
            logging.debug('the current time in set timezone for this show (%s) is earlier than set update time: %s', each.timezone, each.updateTime)
            logging.info('skipping episode for show: %s (too early)', each.showName)

    logging.debug('%s downloading episodes', div(5))
    logging.debug('found %s episodes', len(episodes))
    for show, eachEp in episodes:
        if eachEp.download(parserArgs.dryrun, parserArgs.timeout):
            # write m3u if there was a successful download
            eachEp.writeM3U()
            # tag segments if there was a successful download
            eachEp.tagSegments()
        # this is a bit of a mess: this needs to be added to the episode at some point if it's going to used
        logging.debug('%s cleaning show: %s', div(5), show.showName)
        logging.debug('keeping a maximum of %s episodes', show.programs)
        eachEp.cleanUp(keep = show.programs, dryrun = parserArgs.dryrun)

    return(parserArgs)
        
    
        
#if __name__ == '__main__':
#    main()

In [230]:
# run the downloader once; main() returns the parsed options, kept in foo for inspection in other cells
foo = main()

[   INFO 2017-02-12 22:30] #################### started log ####################
[  DEBUG 2017-02-12 22:30] loading module: requests
[  DEBUG 2017-02-12 22:30] loading module: mutagen.mp3
[  DEBUG 2017-02-12 22:30] loading module: mutagen.mp4
[  DEBUG 2017-02-12 22:30] dryrun option not found in configuration file
[  DEBUG 2017-02-12 22:30] setting to default: False
[  DEBUG 2017-02-12 22:30] ********** searching config file for shows
[  DEBUG 2017-02-12 22:30] ***** found show: %All Things Considered
[  DEBUG 2017-02-12 22:30] verifying configuration
[  DEBUG 2017-02-12 22:30] show name set to: All Things Considered
[  DEBUG 2017-02-12 22:30] fetchmethod set to: NPR_HTML
[  DEBUG 2017-02-12 22:30] ***** found show: %Morning Edition
[  DEBUG 2017-02-12 22:30] verifying configuration
[  DEBUG 2017-02-12 22:30] show name set to: Morning Edition
[  DEBUG 2017-02-12 22:30] fetchmethod set to: NPR_HTML
[  DEBUG 2017-02-12 22:30] ***** found show: %Wait Wait Don't Tell Me
[  DEBUG 2017-02-12

In [232]:
# scratch test: pick one entry at random from a '|'-separated useragent string
# NOTE(review): this import sits outside the notebook's import cell and binds the
# SystemRandom class to the name `random` - confirm nothing else expects the random module
from random import SystemRandom as random
# presumably foo is the value returned by main() in an earlier cell - verify execution order
bar = foo.useragent.split('|')
myRand = random()
print myRand.choice(bar)


Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 


In [237]:
# NOTE(review): build_opener is not among the urllib2 names imported in the notebook's
# import cell (only urlopen and URLError) - confirm where it comes from
opener = build_opener()
# use one of the user agent strings from bar, chosen at random, as the User-Agent header
opener.addheaders = [('User-Agent', myRand.choice(bar))]
# NOTE(review): this statement is cut off in the saved notebook; the call made here is not visible
response = opener.

In [240]:
# dump the body of the response object created in the previous cell
print response.read()

<!DOCTYPE html>
<html>

<head>

<title>Stack Overflow</title>
    <link rel="shortcut icon" href="https://cdn.sstatic.net/Sites/stackoverflow/img/favicon.ico?v=4f32ecc8f43d">
    <link rel="apple-touch-icon image_src" href="https://cdn.sstatic.net/Sites/stackoverflow/img/apple-touch-icon.png?v=c78bd457575a">
    <link rel="search" type="application/opensearchdescription+xml" title="Stack Overflow" href="/opensearch.xml">
    <meta name="twitter:card" content="summary">
    <meta name="twitter:domain" content="stackoverflow.com"/>
    <meta property="og:type" content="website" />
    <meta name="description" content="Stack Overflow is the largest online community for programmers to learn, share their knowledge, and advance their careers"/>

    <meta property="og:image" itemprop="image primaryImageOfPage" content="https://cdn.sstatic.net/Sites/stackoverflow/img/apple-touch-icon@2.png?v=73d79a89bded" />
    <meta name="twitter:title" property="og:title" itemprop="title nam

In [91]:
# scratch check: int() rejects a string that contains a non-digit character (raises ValueError)
# NOTE(review): this rebinds foo, the name another cell uses for the value returned by main()
foo = '9a0'
# not the last expression in the cell, so this result is not displayed
type(foo)
print int(foo)

ValueError: invalid literal for int() with base 10: '9a0'