diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aefadc56f6a..e015112b01b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -465,6 +465,7 @@ from .iprima import IPrimaIE from .iqiyi import IqiyiIE from .ir90tv import Ir90TvIE +from .itunes import iTunesIE from .itv import ITVIE from .ivi import ( IviIE, diff --git a/youtube_dl/extractor/itunes.py b/youtube_dl/extractor/itunes.py new file mode 100644 index 00000000000..024d1098b99 --- /dev/null +++ b/youtube_dl/extractor/itunes.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + extract_attributes, + int_or_none, + unescapeHTML, + unified_strdate, +) + + +class iTunesIE(InfoExtractor): + _VALID_URL = r'https?://itunes\.apple\.com/[a-z]{2}?/?[a-z0-9-]+/?(?P[a-z0-9-]+)?/(?:id)?(?P[0-9]+)' + _TEST = { + 'url': 'https://itunes.apple.com/us/itunes-u/uc-davis-symphony-orchestra/id403834767', + 'info_dict': { + 'id': '403834767', + 'title': 'UC Davis Symphony Orchestra & University Chorus', + }, + 'playlist_count': 31, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id, display_id = mobj.group('id', 'display_id') + if not display_id: + display_id = playlist_id + + webpage = self._download_webpage(url, display_id) + + video_infos = re.findall(r'var\s+__desc_popup_d_\d+\s*=\s*({[^><]+});', webpage) + html_entries = re.findall(r']*role="row"[^>]+>', webpage) + + entries = [] + for idx, html_entry in enumerate(html_entries): + video_info = self._parse_json(video_infos[idx], display_id) + entry = extract_attributes(html_entry) + entries.append({ + 'id': entry['adam-id'], + 'title': entry['preview-title'], + 'description': video_info.get('description'), + 'url': entry.get('audio-preview-url', entry.get('video-preview-url')), + 'duration': int_or_none(entry.get('duration')), + 'release_date': unified_strdate(video_info.get('release_date')), + 'track': unescapeHTML(entry.get('preview-title')), + 'track_number': int_or_none(entry.get('row-number')), + 'track_id': entry.get('adam-id'), + 'artist': unescapeHTML(entry.get('preview-artist')), + 'album': unescapeHTML(entry.get('preview-album')), + }) + + title = self._html_search_regex(r'

(.+)

', webpage, 'title') + + return self.playlist_result(entries, playlist_id, title)