Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[la7] Improvements to the extractor #1575

Merged
Merged 10 commits on Nov 9, 2021
1 change: 1 addition & 0 deletions yt_dlp/__init__.py
Expand Up @@ -715,6 +715,7 @@ def report_args_compat(arg, name):
'extractor_args': opts.extractor_args,
'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
'youtube_include_hls_manifest': opts.youtube_include_hls_manifest,
'generate_filesize': opts.generate_filesize,
'encoding': opts.encoding,
'extract_flat': opts.extract_flat,
'mark_watched': opts.mark_watched,
Expand Down
12 changes: 12 additions & 0 deletions yt_dlp/extractor/common.py
Expand Up @@ -53,6 +53,7 @@
format_field,
GeoRestrictedError,
GeoUtils,
HEADRequest,
int_or_none,
js_to_json,
JSON_LD_RE,
Expand Down Expand Up @@ -3598,6 +3599,17 @@ def _generic_id(self, url):
def _generic_title(self, url):
return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])

def _generate_filesize(self, url, url_id):
    """Return the Content-Length reported for *url*, or None.

    The lookup only happens when the 'generate_filesize' param is truthy.
    When it is not, no request is made: a hint about the
    --generate-filesize option is printed with to_screen() and None is
    returned.

    url    -- address probed with a HEAD request
    url_id -- identifier handed to _request_webpage() alongside the request
    """
    if not self.get_param('generate_filesize', False):
        self.to_screen('With --generate-filesize you can enable filesize calculation for some video formats.')
        return None

    # fatal=False: a failed probe is not supposed to abort the extraction;
    # a falsy handle simply means "size unknown".
    response = self._request_webpage(
        HEADRequest(url), url_id,
        note='Check filesize', fatal=False)
    if not response:
        return None

    content_length = response.headers.get('Content-Length', None)
    return int_or_none(content_length)

@staticmethod
def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
all_known = all(map(
Expand Down
54 changes: 41 additions & 13 deletions yt_dlp/extractor/la7.py
Expand Up @@ -8,7 +8,6 @@
determine_ext,
float_or_none,
parse_duration,
smuggle_url,
unified_strdate,
)

Expand All @@ -25,19 +24,33 @@ class LA7IE(InfoExtractor):
'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
'info_dict': {
'id': '0_42j6wd36',
'id': 'inccool8-02-10-2015-163722',
'ext': 'mp4',
'title': 'Inc.Cool8',
'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
'thumbnail': 're:^https?://.*',
'uploader_id': 'kdla7pillole@iltrovatore.it',
'timestamp': 1443814869,
'upload_date': '20151002',
},
}, {
'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
'only_matching': True,
}]
_HOST = 'https://awsvodpkg.iltrovatore.it'

def _generate_mp4_url(self, quality, m3u8_formats):
    """Build a direct HTTPS MP4 format from the matching HLS format.

    Scans *m3u8_formats* for the first entry that is not audio-only
    (its 'vcodec' is not 'none') and whose 'url' contains *quality*, then
    returns a copy of that entry pointing at '<_HOST><quality>.mp4'.

    quality      -- path fragment identifying one rendition; it is both
                    matched against the HLS URLs and used to build the
                    MP4 URL
    m3u8_formats -- iterable of format dicts, each with at least 'url'
                    and 'format_id'

    Returns the new format dict, or None when no entry matches.
    """
    for hls_format in m3u8_formats:
        # .get(): 'vcodec' is optional in a format dict; an entry without
        # it is treated as carrying video instead of raising KeyError.
        if hls_format.get('vcodec') == 'none':
            continue
        if quality not in hls_format['url']:
            continue

        http_url = '%s%s.mp4' % (self._HOST, quality)

        http_format = hls_format.copy()
        # The manifest reference does not apply to a plain file download;
        # pop() tolerates entries that never had the key.
        http_format.pop('manifest_url', None)
        http_format.update({
            'format_id': http_format['format_id'].replace('hls-', 'https-'),
            'url': http_url,
            'protocol': 'https',
            'filesize_approx': self._generate_filesize(http_url, quality),
        })
        return http_format

    return None

def _real_extract(self, url):
video_id = self._match_id(url)
Expand All @@ -47,21 +60,36 @@ def _real_extract(self, url):

webpage = self._download_webpage(url, video_id)

player_data = self._search_regex(
[r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'],
webpage, 'player data')
vid = self._search_regex(r'vid\s*:\s*"(.+?)",', player_data, 'vid')
upload_date = self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False)
video_path = self._search_regex(r'(/content/.*?).mp4', webpage, 'video_path')

formats = []

m3u8_url = '%s/local/hls/,%s.mp4.urlset/master.m3u8' % (
self._HOST, video_path)
m3u8_formats = self._extract_m3u8_formats(
m3u8_url, video_id, m3u8_id='hls')
formats.extend(m3u8_formats)

mpd_url = '%s/local/dash//,%s.mp4.urlset/manifest.mpd' % (
self._HOST, video_path)
formats.extend(self._extract_mpd_formats(
mpd_url, video_id, mpd_id='dash', fatal=False))

for q in filter(None, video_path.split(',')):
http_f = self._generate_mp4_url(q, m3u8_formats)
if http_f:
formats.append(http_f)

self._sort_formats(formats)

return {
'_type': 'url_transparent',
'url': smuggle_url('kaltura:103:%s' % vid, {
'service_url': 'http://nkdam.iltrovatore.it',
}),
'id': video_id,
'title': self._og_search_title(webpage, default=None),
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'ie_key': 'Kaltura',
'formats': formats,
'upload_date': unified_strdate(upload_date)
}


Expand Down
4 changes: 4 additions & 0 deletions yt_dlp/options.py
Expand Up @@ -1546,6 +1546,10 @@ def _dict_from_options_callback(
'--youtube-skip-hls-manifest', '--no-youtube-include-hls-manifest',
action='store_false', dest='youtube_include_hls_manifest',
help=optparse.SUPPRESS_HELP)
extractor.add_option(
'--generate-filesize',
action='store_true', dest='generate_filesize', default=False,
help='Generates filesize_approx from Content-Length header for some video formats.')

parser.add_option_group(general)
parser.add_option_group(network)
Expand Down