In [22]:
#!/usr/bin/env python
# Copyright 2016 Aaron Ciuffo

# Banner text: tool name and release, author, licence pointer.
version = (
    'NPR Podcast Downloader V5.0\n'
    '\n'
    'by Aaron Ciuffo (txoof.com)\n'
    'released without warranty under GPLV3:\n'
    'http://www.gnu.org/licenses/gpl-3.0.html\n'
    "Please don't sue me.\n"
)

# Short program identifier; used below as the logger name and the log file stem.
programName = 'podcastdownload'
# regexp for finding titles: titleList = re.findall(r"audio-module-title\"\>(.*)\<", file)
# regexp for finding urls: urlList = re.findall(r"download.*href\=\"(https:\/\/ondemand.npr.org\/.*mp3).*\?", file)


# Imports
import datetime # for time stuff
import json
import logging # logging library
import os # filesystem paths and directory creation
import re # regular expressions
from urllib2 import urlopen # standard library for interfacing with web resources


# TO DO
 * update the output path to match the date of the show and a 0 or 2 for morning/evening


In [2]:
# Logging configuration: console output through the root logger plus a
# per-program log file attached to the named logger.
loglevel = 'DEBUG'

# Default of None lets an unknown level name reach the ValueError below
# instead of escaping as an AttributeError from getattr.
numeric_level = getattr(logging, loglevel.upper(), None)
if not isinstance(numeric_level, int):
    raise ValueError('Invalid log level: %s' % loglevel)
logging.basicConfig(level=numeric_level)
logger = logging.getLogger(programName)

# Re-running this cell must not stack another FileHandler on the same logger,
# otherwise every record is written to the log file once per run of the cell.
for stale_handler in list(logger.handlers):
    if isinstance(stale_handler, logging.FileHandler):
        logger.removeHandler(stale_handler)
        stale_handler.close()

# create a file handler
handler = logging.FileHandler(programName+'.log')
handler.setLevel(numeric_level)

# create a logging format
formatter = logging.Formatter('%(asctime)s %(name)s[%(levelname)-8s]: %(message)s')
handler.setFormatter(formatter)

# add the handlers to the logger
logger.addHandler(handler)


In [3]:
# Write an INFO record marking the start of the log.
logger.info('starting log')

INFO:podcastdownload:starting log


In [None]:
def loadModules():
    '''
    load non standard modules

    Imports the requests package and binds it as a module-level global.

    Returns:
        bool: True when requests was imported

    Raises:
        SystemExit: with exit status 2 when requests cannot be imported
    '''
    global requests
    try:
        import requests
    except ImportError:
        # report through the program logger so the failure also reaches the log file
        logger.critical('Fatal Error\nFailed to load module: requests')
        logger.critical('Please install requests module: http://docs.python-requests.org/')
        # raise directly rather than calling exit(), which is the site/IPython helper
        raise SystemExit(2)
    return True

In [None]:
# Import the optional third-party modules; the return value is not used here.
loadModules()


In [149]:
class Episode(object):
    '''Download a podcast episode

    Attributes:
        name (str): name of episode/podcast
        programURL (str): Index URL
        segments (list): Segment() objects to be downloaded
        showDate (str): date of the show; empty until one is set
        outputBasePath (str): base path to use for output of files (default is ./)
        outputPath (str): full path to output the downloaded podcast files
    '''

    def __init__(self):
        '''Create an episode with placeholder values and the default output path'''
        self.name = "undef"
        self.programURL = "undef"
        self.segments = []
        self.showDate = ''
        self.outputBasePath = './'
        # updateOutputPath() stores the path itself; assigning its result here
        # used to overwrite the attribute with None
        self.updateOutputPath()

    def updateOutputPath(self, episodePath = ''):
        '''
        Rebuild outputPath from outputBasePath and an episode sub-directory

        Args:
            episodePath (str): directory name below outputBasePath (default: none)

        Returns:
            str: the new outputPath, always ending in a path separator
        '''
        # the trailing '' makes join() end the result with a separator; join() also
        # copes with a base path given with or without its own trailing separator
        self.outputPath = os.path.join(self.outputBasePath, episodePath, '')
        return self.outputPath

    def download(self):
        '''
        Download all segments into outputPath

        A segment that cannot be fetched or written is logged and skipped; the
        remaining segments are still attempted.

        Returns:
            bool: True if every segment was fetched and written, False otherwise
        '''

        logger.info('downloading program: %s', self.name)

        # make sure the target directory exists before any file is written
        if not os.path.isdir(self.outputPath):
            try:
                os.makedirs(self.outputPath)
            except OSError as e:
                logger.error('could not create output directory %s\nerrors follow', self.outputPath)
                logger.error(e)
                return False

        allSucceeded = True
        for segment in self.segments:
            filePath = os.path.join(self.outputPath, segment.filename)

            logger.debug('downloading %s', segment.audioURL)
            try:
                audioFile = urlopen(segment.audioURL).read()
            except Exception as e:
                logger.warning('could not download segment number: %s\nerrors follow', segment.number)
                logger.warning(e)
                allSucceeded = False
                continue

            logger.info('writing file to %s', filePath)
            try:
                with open(filePath, 'wb') as code:
                    code.write(audioFile)
            except Exception as e:
                logger.warning('could not write segment number %s to %s\nerrors follow', segment.number, filePath)
                logger.warning(e)
                allSucceeded = False
                continue

        return allSucceeded

    def addSegment(self, segment):
        '''
        Add a downloadable segment to the segment list

        Args:
            segment (Segment): segment to append to self.segments
        '''
        self.segments.append(segment)
            
    

In [259]:
class NPREpisode(Episode, object):
    '''Download an episode from NPR

    Attributes:
        jsonData (dict): program data decoded from the program page; None until
            getepisode_HTML() has parsed it
    '''

    def __init__(self):
        super(NPREpisode, self).__init__()
        self.jsonData = None


    def getepisode_API(self):
        '''Use the NPR API to get a list of episodes (not implemented)'''
        pass


    @staticmethod
    def _abbreviateName(name):
        '''
        Build the output directory suffix for a program name: an underscore, the
        first character of each word, and the first two characters of the last word

        e.g. 'MORNING EDITION' -> '_MED'
        '''
        words = name.split(' ')
        lastIndex = len(words) - 1
        return '_' + ''.join(word[:2 if index == lastIndex else 1]
                             for index, word in enumerate(words))


    def getepisode_HTML(self):
        '''
        scrape the HTML for JSON containing the segment and title information

        Sets showDate, jsonData, name and outputPath, and adds one Segment per
        usable entry in the JSON 'audioData' list.

            Returns: True if successful, false otherwise
        '''
        logger.info('fetching episode info via HTML method')
        logger.debug('source: %s', self.programURL)

        # raw strings so the regex escapes reach the re module unchanged
        search_PlayAll = r"<b.*data-play-all='(\{.*\})'></b>" # JSON data in program HTML
        search_FileName = r"^([\s\w.'-]*)" # leading run of spaces, word characters, . ' and -
        search_showDate = r'datetime="(\d{4}-\d{2}-\d{2})'

        try: # fetch the full HTML
            programHTML = urlopen(self.programURL).read()
        except Exception as e:
            logger.warning('could not fetch episode information from %s', self.programURL)
            logger.error(e)
            return False
        logger.info('HTML retrieved')

        # find the show date and record it; a missing date is only worth a warning
        dateMatch = re.search(search_showDate, programHTML)
        if dateMatch:
            self.showDate = dateMatch.group(1)
            logger.info('show date: %s', self.showDate)
        else:
            self.showDate = ''
            logger.warning('no valid showDate found')

        try: # find the JSON program data
            self.jsonData = json.loads(re.search(search_PlayAll, programHTML).group(1))
        except Exception as e:
            logger.warning('no valid JSON episode listing found in HTML from %s', self.programURL)
            logger.error(e)
            return False

        # check that some JSON data was found - not terribly robust
        audioData = self.jsonData.get('audioData') if isinstance(self.jsonData, dict) else None
        if not isinstance(audioData, list) or len(audioData) < 1:
            logger.warning('no valid audioData found in JSON object for this program')
            return False

        # assumes each audioData entry is a dict - TODO confirm against the page format
        try:
            self.name = audioData[0]['program'].upper() # set the episode name
        except (KeyError, TypeError, AttributeError):
            self.name = ''
        logger.info('JSON program information found for %s', self.name)
        logger.debug('segments found: %s', len(audioData))

        # add an extension to help differentiate between episodes
        if self.name:
            output_extension = self._abbreviateName(self.name)
        else:
            # no program name: use epoch seconds to prevent clobbering
            sinceEpoch = datetime.datetime.utcnow() - datetime.datetime.utcfromtimestamp(0)
            output_extension = '_%d' % sinceEpoch.total_seconds()

        logger.debug('output path set to: %s', self.showDate + output_extension)
        self.updateOutputPath(self.showDate + output_extension) # include the show date

        # walk the audioData list and build one Segment per usable entry
        for index, segmentData in enumerate(audioData):
            number = index + 1 # set the human readable segment number
            try:
                audioURL = segmentData['audioUrl']
                title = segmentData['title']
                # last component of the URL path, cut at the first character the
                # pattern does not allow (e.g. the '?' that starts a query string)
                filename = re.search(search_FileName, audioURL.split('/')[-1]).group(1)
            except (KeyError, TypeError, AttributeError) as e:
                logger.warning('failed to parse JSON data: %s', e)
                continue
            logger.debug('%s - %s', number, title)

            if not filename:
                logger.warning('no filename found; dropping segment')
                continue

            self.addSegment(Segment(audioURL, filename, number, title))

        return True
            

In [252]:
class Segment():
    '''A single downloadable piece of a podcast episode'''

    def __init__(self, audioURL, filename, number, title = None):
        '''
        Args:
            audioURL (str): URL to specific downloadable content
            filename (str): output filename
            number (int): ordinal number of segment
            title (str): title of the segment (default: None)
        '''
        # store the constructor arguments unchanged
        (self.audioURL,
         self.filename,
         self.number,
         self.title) = (audioURL, filename, number, title)

In [260]:
# Build an episode object and point its output below ./output/
mynpr = NPREpisode()
mynpr.outputBasePath = './output/'
# placeholder sub-directory; getepisode_HTML() below calls updateOutputPath() again
# with the show date plus the program abbreviation
mynpr.updateOutputPath('foo')
# alternative program page, currently disabled:
#mynpr.programURL = 'http://www.npr.org/programs/all-things-considered'
mynpr.programURL = 'http://www.npr.org/programs/morning-edition/'
# fetch the program page and collect its segments; the cell displays the returned status
mynpr.getepisode_HTML()

INFO:podcastdownload:fetching episode info via HTML method
DEBUG:podcastdownload:source: http://www.npr.org/programs/morning-edition/
INFO:podcastdownload:HTML retrieved
INFO:podcastdownload:show date: 2016-10-28
INFO:podcastdownload:JSON program information found for MORNING EDITION
DEBUG:podcastdownload:segments found: 19
DEBUG:podcastdownload:output path set to: 2016-10-28_MED
DEBUG:podcastdownload:1 - Nevada Neighbors Favor Different Presidential Candidates
DEBUG:podcastdownload:2 - Protests Escalate Against Venezuela's President
DEBUG:podcastdownload:3 - Oregon Jury Acquits Defendants In Wildlife Refuge Occupation
DEBUG:podcastdownload:4 - The Social Science Research Behind Political Campaign Ads
DEBUG:podcastdownload:5 - In Chechnya, Take Care How You Dance At Weddings
DEBUG:podcastdownload:6 - 'Economist' Correspondent Spends Time On Migrant Rescue Ship
DEBUG:podcastdownload:7 - Democrats Focus On Rural New York To Retake The U.S. House
DEBUG:podcastdownload:8 - In Israel, A Pus

True

In [261]:
# Display the output path currently set on the episode.
mynpr.outputPath

u'./output/2016-10-28_MED/'

In [32]:
# Download every segment held in mynpr.segments.
mynpr.download()

INFO:podcastdownload:downloading https://ondemand.npr.org/anon.npr-mp3/npr/me/2016/10/20161028_me_nevada_neighbors_favor_different_presidential_candidates.mp3?orgId=1&topicId=1014&d=433&p=3&story=499710808&t=progseg&e=499700973&seg=1&siteplayer=true
INFO:podcastdownload:writing file to ./output//20161028_me_nevada_neighbors_favor_different_presidential_candidates.mp3
INFO:podcastdownload:downloading https://ondemand.npr.org/anon.npr-mp3/npr/me/2016/10/20161028_me_protests_escalate_against_venezuelas_president.mp3?orgId=1&topicId=1127&d=188&p=3&story=499710815&t=progseg&e=499700973&seg=2&siteplayer=true
INFO:podcastdownload:writing file to ./output//20161028_me_protests_escalate_against_venezuelas_president.mp3
INFO:podcastdownload:downloading https://ondemand.npr.org/anon.npr-mp3/npr/me/2016/10/20161028_me_oregon_jury_acquits_defendants_in_wildlife_refuge_occupation.mp3?orgId=1&topicId=1070&d=199&p=3&story=499710822&t=progseg&e=499700973&seg=3&siteplayer=true
INFO:podcastdownload:writi

In [23]:
# Scratch check: today's date formatted as YYYY-MM-DD.
datetime.datetime.today().strftime('%Y-%m-%d')

'2016-10-28'