Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[npr] Add new extractor #13446

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion youtube_dl/extractor/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,10 @@
VPROIE,
WNLIE,
)
from .npr import NprIE
from .npr import (
NprPlaylistIE,
NprVideoIE,
)
from .nrk import (
NRKIE,
NRKPlaylistIE,
Expand Down
135 changes: 93 additions & 42 deletions youtube_dl/extractor/npr.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,79 @@
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlencode
from ..utils import (
int_or_none,
qualities,
)
from ..utils import int_or_none


class NprIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)'
class NprBaseIE(InfoExtractor):
    """Shared helpers for the NPR extractors: story lookup and format collection."""

    # Extension reported for each format key that is passed through as a
    # direct URL by _extract_formats().
    _FORMAT_EXTENSIONS = {
        'hls': 'm3u8',
        'hlsOnDemand': 'm3u8',
        'mediastream': 'mp3',
        'mp3': 'mp3',
        'mp4': 'mp4',
        'wm': 'wm',
        'threegp': '3gp',
    }

    def _extract_info(self, story_id):
        """Query the NPR API for a story and return its first story record.

        story_id is sent as the ``id`` query parameter and is also used as
        the id passed to _download_json().

        Raises KeyError/IndexError if the response does not contain
        ``list -> story -> [0]``.
        """
        json_data = self._download_json(
            'http://api.npr.org/query', story_id, query={
                'id': story_id,
                'fields': 'titles,audio,show,multimedia,text',
                'format': 'json',
                'apiKey': 'MDAzMzQ2MjAyMDEyMzk4MTU1MDg3ZmM3MQ010',
            })

        return json_data['list']['story'][0]

    def _extract_formats(self, media_info):
        """Collect format dicts from the ``format`` mapping of a media item.

        media_info is a dict with an optional ``format`` mapping of
        format key -> entry (or list of entries, of which only the first is
        used); each entry carries its URL under ``$text``. ``media_info['id']``
        is required only when a ``smil`` or ``m3u8`` entry is present.

        Returns a list of format dicts; entries without a URL are skipped.
        """
        formats = []
        for format_id, formats_entry in media_info.get('format', {}).items():
            if not formats_entry:
                continue
            if isinstance(formats_entry, list):
                formats_entry = formats_entry[0]
            format_url = formats_entry.get('$text')
            if not format_url:
                continue
            if format_id == 'smil':
                formats.extend(self._extract_smil_formats(
                    format_url, media_info['id'], fatal=False))
                continue
            # NOTE(review): _FORMAT_EXTENSIONS lists 'hls'/'hlsOnDemand' as
            # m3u8, but this branch only matches the literal key 'm3u8' --
            # confirm which key the API actually returns.
            if format_id == 'm3u8':
                # Error 404 for some reason
                formats.extend(self._extract_m3u8_formats(
                    format_url, media_info['id'], fatal=False))
                continue
            formats.append({
                'url': format_url,
                'format_id': format_id,
                'ext': self._FORMAT_EXTENSIONS.get(format_id),
            })
        return formats

    # Public names kept as thin wrappers so existing callers of
    # extract_info()/extract_formats() keep working; the parameter name
    # ``id`` is retained only for keyword-argument compatibility.
    def extract_info(self, id):
        """Backward-compatible alias of _extract_info()."""
        return self._extract_info(id)

    def extract_formats(self, media_info):
        """Backward-compatible alias of _extract_formats()."""
        return self._extract_formats(media_info)


class NprPlaylistIE(NprBaseIE):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't change extractor name.

_VALID_URL = r'https?://(?:www\.)?npr\.org/(?:sections/\w+/\d+/\d+/\d+/|player/v2/mediaPlayer\.html\?.*\bid=)(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205',
'url': 'http://www.npr.org/sections/allsongs/2015/10/21/449974205/new-music-from-beach-house-chairlift-cmj-discoveries-and-more',
'info_dict': {
'id': '449974205',
'title': 'New Music From Beach House, Chairlift, CMJ Discoveries And More'
},
'playlist_count': 7,
}, {
'url': 'http://www.npr.org/player/v2/mediaPlayer.html?action=1&t=1&islist=false&id=446928052&m=446929930&live=1',
'url': 'http://www.npr.org/sections/deceptivecadence/2015/10/09/446928052/music-from-the-shadows-ancient-armenian-hymns-and-piano-jazz',
'info_dict': {
'id': '446928052',
'title': "Songs We Love: Tigran Hamasyan, 'Your Mercy is Boundless'"
},
'playlist': [{
'md5': '12fa60cb2d3ed932f53609d4aeceabf1',
'md5': 'df2917b738fdd2358a9f0e7e49fcdf2e',
'info_dict': {
'id': '446929930',
'ext': 'mp3',
# 'ext': 'mp4',
'title': 'Your Mercy is Boundless (Bazum en Qo gtutyunqd)',
'duration': 402,
},
Expand All @@ -36,47 +82,52 @@ class NprIE(InfoExtractor):

def _real_extract(self, url):
    """Return a playlist result with one entry per audio item of the story.

    The story id is taken from the URL, the story record is fetched once
    through extract_info(), and each item under ``audio`` becomes an entry
    whose formats come from extract_formats().
    """
    playlist_id = self._match_id(url)

    # One API request only: extract_info() already performs the download,
    # so the story is not fetched a second time here.
    story = self.extract_info(playlist_id)

    entries = []
    for audio in story.get('audio', []):
        formats = self.extract_formats(audio)
        # Formats were sorted before being returned in the earlier
        # implementation; keep doing so for the shared-helper output.
        self._sort_formats(formats)
        entries.append({
            'id': audio['id'],
            'title': audio.get('title', {}).get('$text'),
            'duration': int_or_none(audio.get('duration', {}).get('$text')),
            'formats': formats,
        })

    playlist_title = story.get('title', {}).get('$text')
    return self.playlist_result(entries, playlist_id, playlist_title)


class NprVideoIE(NprBaseIE):
    """Extractor for ``npr.org/event/music/<id>`` pages carrying a video."""

    _VALID_URL = r'https?://(?:www\.)?npr\.org/event/music/(?P<id>\d+)'

    _TEST = {
        'url': 'http://www.npr.org/event/music/533198237/tigers-jaw-tiny-desk-concert',
        'md5': '5b385e0e96c2731261df9a4ed1ff2cba',
        'info_dict': {
            'id': '533201718',
            'title': 'Tigers Jaw: Tiny Desk Concert',
            'ext': 'mp4',
            'width': 624,
            'height': 351,
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the first multimedia item of the story.

        Raises KeyError/IndexError when the story has no ``multimedia``
        entry, no title text, or the media item has no ``id``.
        """
        # The number in the URL identifies the story, not the video, and it
        # is not a human-readable display id, so it is only used for lookup.
        story_id = self._match_id(url)
        story = self.extract_info(story_id)

        # Mandatory data is read with plain subscripts so that a missing key
        # fails immediately instead of yielding a None id/title.
        video = story['multimedia'][0]

        formats = self.extract_formats(video)
        self._sort_formats(formats)

        # Only 'formats' is returned; the page URL is not a media URL and
        # must not be reported alongside the format list.
        return {
            'id': video['id'],
            'title': story['title']['$text'],
            'duration': int_or_none(video.get('duration', {}).get('$text')),
            'formats': formats,
            'width': int_or_none(video.get('width', {}).get('$text')),
            'height': int_or_none(video.get('height', {}).get('$text')),
        }