-
Notifications
You must be signed in to change notification settings - Fork 9.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support Archive.org playlists download #31309
Comments
|
This is a minimal patch to support the problem URL:
--- old/youtube_dl/extractor/archiveorg.py
+++ new/youtube_dl/extractor/archiveorg.py
@@ -1,9 +1,17 @@
+# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
+from ..compat import (
+ compat_filter as filter,
+ compat_urllib_parse_unquote_plus,
+)
from ..utils import (
clean_html,
extract_attributes,
+ ExtractorError,
unified_strdate,
unified_timestamp,
)
@@ -11,8 +19,12 @@
class ArchiveOrgIE(InfoExtractor):
IE_NAME = 'archive.org'
IE_DESC = 'archive.org videos'
- _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?archive\.org/
+ (?:(?P<det>details)|embed)/
+ (?P<id>(?(det)[^/]+/)?[^/?#]+)(?:[?#]|/?$)
+ '''
_TESTS = [{
'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'md5': '8af1d4cf447933ed3c7f4871162602db',
@@ -46,8 +58,11 @@
'only_matching': True,
}]
+
def _real_extract(self, url):
- video_id = self._match_id(url)
+ video_id = compat_urllib_parse_unquote_plus(self._match_id(url))
+ video_id, entry_id = (video_id.split('/', 1) + [None])[:2]
+
webpage = self._download_webpage(
'http://archive.org/embed/' + video_id, video_id)
@@ -67,10 +82,18 @@
if jwplayer_playlist:
info = self._parse_jwplayer_data(
{'playlist': jwplayer_playlist}, video_id, base_url=url)
+ for entry in info.get('entries') or []:
+ e_id = entry.get('thumbnail')
+ if e_id:
+ e_id = self._generic_id(e_id).rsplit('/', 1)[-1]
+ e_id = re.sub(r'(?!^)_\d+$', '', e_id).replace(' ', '_')
+ e_id = (entry.get('id') or video_id).replace(video_id, '/'.join((video_id, e_id)))
+ entry['id'] = (entry.get('id') or video_id).replace(video_id, e_id)
else:
# HTML5 media fallback
info = self._parse_html5_media_entries(url, webpage, video_id)[0]
- info['id'] = video_id
+
+ info.setdefault('id', video_id)
def get_optional(metadata, field):
return metadata.get(field, [None])[0]
@@ -81,8 +104,23 @@
})['metadata']
info.update({
'title': get_optional(metadata, 'title') or info.get('title'),
- 'description': clean_html(get_optional(metadata, 'description')),
})
+ if entry_id and info.get('entries') and '.' in entry_id:
+ ext = ''.join(entry_id.rpartition('.')[1:])
+
+ def match_entry(x):
+ if not x.get('id'):
+ return False
+ return bool(re.search(
+ r'(?:^|/)%s$' % (entry_id, ),
+ x['id'] + ext))
+
+ info = next(filter(match_entry, info['entries']), None)
+ if not info:
+ raise ExtractorError('Entry %s not found in %s' % (entry_id, video_id))
+
+ if not info.get('description'):
+ info['description'] = clean_html(get_optional(metadata, 'description'))
if info.get('_type') != 'playlist':
creator = get_optional(metadata, 'creator')
info.update({ |
Checklist
archive.org playlists are broken #25466, closed as a duplicate with no link to the original issue
Description
Support downloading Archive.org playlists, e.g. https://archive.org/details/energy-from-the-vacuum/Energy+From+The+Vacuum/
The text was updated successfully, but these errors were encountered: