Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[npr] Add new extractor #13446

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion youtube_dl/extractor/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,10 @@
VPROIE,
WNLIE,
)
from .npr import NprIE
from .npr import (
NprIE,
NprVideoIE,
)
from .nrk import (
NRKIE,
NRKPlaylistIE,
Expand Down
140 changes: 101 additions & 39 deletions youtube_dl/extractor/npr.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,88 @@
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlencode
from ..utils import (
int_or_none,
qualities,
ExtractorError
)


class NprIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)'
class NprBaseIE(InfoExtractor):
    """Shared helpers for the NPR extractors: story lookup and format collection."""

    def _extract_info(self, id_):
        """Query the NPR API for ``id_`` and return the first story in the reply.

        The JSON reply is indexed as ``['list']['story'][0]`` without any
        guarding, so a reply lacking those keys raises KeyError/IndexError.
        """
        api_query = {
            'id': id_,
            'fields': 'titles,audio,show,multimedia,text',
            'format': 'json',
            'apiKey': 'MDAzMzQ2MjAyMDEyMzk4MTU1MDg3ZmM3MQ010',
        }
        response = self._download_json(
            'http://api.npr.org/query', id_, query=api_query)
        return response['list']['story'][0]

    def _extract_formats(self, media_info):
        """Build the sorted list of format dicts for one media item.

        ``media_info`` is a dict whose optional ``'format'`` entry maps a
        format id to an entry (or a list of entries, of which only the first
        is used) carrying the media URL under ``'$text'``. ``media_info['id']``
        is only read for the ``smil`` and ``m3u8`` format ids.
        """
        # File extension reported for each plain (directly appended) format id.
        ext_by_format_id = {
            'hls': 'm3u8', 'hlsOnDemand': 'm3u8', 'mediastream': 'mp3',
            'mp3': 'mp3', 'mp4': 'mp4', 'wm': 'wm', 'threegp': '3gp'
        }

        collected = []
        for format_id, entry in media_info.get('format', {}).items():
            # Skip empty entries; unwrap list-valued entries to their first item.
            if not entry:
                continue
            if isinstance(entry, list):
                entry = entry[0]
            format_url = entry.get('$text')
            if not format_url:
                continue

            if format_id == 'smil':
                # Non-fatal: a broken SMIL manifest just contributes nothing.
                collected.extend(self._extract_smil_formats(
                    format_url,
                    media_info['id'],
                    fatal=False
                ))
            elif format_id == 'm3u8':
                # Try the URL with '200000' rewritten to '2000000' first
                # (presumably a bitrate token -- TODO confirm); if that fails,
                # fall back to the unmodified URL, this time non-fatally.
                try:
                    variants = self._extract_m3u8_formats(
                        format_url.replace('200000', '2000000'),
                        media_info['id']
                    )
                except ExtractorError:
                    variants = self._extract_m3u8_formats(
                        format_url,
                        media_info['id'],
                        fatal=False
                    )
                collected.extend(variants)
            else:
                # Unknown format ids still get appended, with 'ext' set to None.
                collected.append({
                    'url': format_url,
                    'format_id': format_id,
                    'ext': ext_by_format_id.get(format_id),
                })

        self._sort_formats(collected)
        return collected


class NprIE(NprBaseIE):
_VALID_URL = r'https?://(?:www\.)?npr\.org/(?:sections/\w+/\d+/\d+/\d+/|player/v2/mediaPlayer\.html\?.*\bid=)(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205',
'url': 'http://www.npr.org/sections/allsongs/2015/10/21/449974205/new-music-from-beach-house-chairlift-cmj-discoveries-and-more',
'info_dict': {
'id': '449974205',
'title': 'New Music From Beach House, Chairlift, CMJ Discoveries And More'
},
'playlist_count': 7,
}, {
'url': 'http://www.npr.org/player/v2/mediaPlayer.html?action=1&t=1&islist=false&id=446928052&m=446929930&live=1',
'url': 'http://www.npr.org/sections/deceptivecadence/2015/10/09/446928052/music-from-the-shadows-ancient-armenian-hymns-and-piano-jazz',
'info_dict': {
'id': '446928052',
'title': "Songs We Love: Tigran Hamasyan, 'Your Mercy is Boundless'"
},
'playlist': [{
'md5': '12fa60cb2d3ed932f53609d4aeceabf1',
'md5': 'df2917b738fdd2358a9f0e7e49fcdf2e',
'info_dict': {
'id': '446929930',
'ext': 'mp3',
'ext': 'mp4',
'title': 'Your Mercy is Boundless (Bazum en Qo gtutyunqd)',
'duration': 402,
},
Expand All @@ -36,47 +91,54 @@ class NprIE(InfoExtractor):

def _real_extract(self, url):
playlist_id = self._match_id(url)

config = self._download_json(
'http://api.npr.org/query?%s' % compat_urllib_parse_urlencode({
'id': playlist_id,
'fields': 'titles,audio,show',
'format': 'json',
'apiKey': 'MDAzMzQ2MjAyMDEyMzk4MTU1MDg3ZmM3MQ010',
}), playlist_id)

story = config['list']['story'][0]

KNOWN_FORMATS = ('threegp', 'mp4', 'mp3')
quality = qualities(KNOWN_FORMATS)
story = self._extract_info(playlist_id)

entries = []
for audio in story.get('audio', []):
title = audio.get('title', {}).get('$text')
duration = int_or_none(audio.get('duration', {}).get('$text'))
formats = []
for format_id, formats_entry in audio.get('format', {}).items():
if not formats_entry:
continue
if isinstance(formats_entry, list):
formats_entry = formats_entry[0]
format_url = formats_entry.get('$text')
if not format_url:
continue
if format_id in KNOWN_FORMATS:
formats.append({
'url': format_url,
'format_id': format_id,
'ext': formats_entry.get('type'),
'quality': quality(format_id),
})
self._sort_formats(formats)
entries.append({
'id': audio['id'],
'title': title,
'duration': duration,
'formats': formats,
'formats': self._extract_formats(audio),
})

playlist_title = story.get('title', {}).get('$text')
return self.playlist_result(entries, playlist_id, playlist_title)


class NprVideoIE(NprBaseIE):
    """Extractor for single videos on npr.org ``/event/music/<id>`` pages."""

    _VALID_URL = r'https?://(?:www\.)?npr\.org/event/music/(?P<id>\d+)'

    _TEST = {
        'url': 'http://www.npr.org/event/music/533198237/tigers-jaw-tiny-desk-concert',
        'md5': '2ca640c9725579ea7d020dd23b9cffc2',
        'info_dict': {
            'id': '533198237',
            'title': 'Tigers Jaw: Tiny Desk Concert',
            'ext': 'm3u8',
            'width': 1280,
            'height': 720,
        },
        'expected_warnings': ['HTTP Error 404'],
    }

    def _real_extract(self, url):
        """Return the info dict for the video story addressed by ``url``.

        Uses the first item of the story's ``'multimedia'`` list as the video.

        Raises:
            ExtractorError: if the story has no title or no multimedia entry.
        """
        video_id = self._match_id(url)
        story = self._extract_info(video_id)

        title = story.get('title', {}).get('$text')
        if title is None:
            # The exception must actually be raised; merely constructing it
            # lets a title-less result slip through.
            raise ExtractorError('Fail extracting title')

        # Guard against a missing or empty 'multimedia' list so the failure is
        # reported as an extractor error rather than a TypeError/IndexError.
        multimedia = story.get('multimedia')
        if not multimedia:
            raise ExtractorError('Fail extracting multimedia')
        video = multimedia[0]

        return {
            'id': video_id,
            'title': title,
            'duration': int_or_none(video.get('duration', {}).get('$text')),
            'formats': self._extract_formats(video),
            'width': int_or_none(video.get('width', {}).get('$text')),
            'height': int_or_none(video.get('height', {}).get('$text')),
        }