-
Notifications
You must be signed in to change notification settings - Fork 9.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LinkedIn] ERROR: An extractor error has occurred. #31270
Comments
Do it? |
... The URL in OP should work without the query parameters and without providing login credentials; from what I gathered, it must be a free sample: (... it plays fine inside my browser 😄 );
Sadly, what is actually being downloaded is not a media file but an HTML one, i.e. the page's source code:
Inside the <div class="share-native-video">
<video class="share-native-video__node video-js" data-sources=
"[{"src":"https://dms.licdn.com/playlist/C4D0DAQGPfzHW4lxbTw/learning-original-video-vbr-540/0/1598782239071?e=1665342000&v=beta&t=dwvCYQxhSyDwwUmJAmm62GV9oPTuK-mQZ3DczI2u7_8#.mp4"}]"
data-poster-url=
"https://media-exp1.licdn.com/dms/image/C4E0DAQEIBfhUmf3mlw/learning-public-crop_675_1200/0/1567118425117?e=1665342000&v=beta&t=XO_op2UrLN64_KQbrSj9lOk7ZACr6p7GblyOuweD6mg"
data-captions-url=
"https://www.linkedin.com/ambry/?x-li-ambry-ep=AQIMn_ZSVB7fCQAAAYOaDcda-TEnQXD-2_tND91gB_iJ-8yCizcDJTGyq02N63-Q_ApFqSlVY2hn7z3zvzkmPud1C9eKTbQKpe-rU3AB6tiBK57Z4gjR05o7TXuEhZS9vF6zj7WA5667DYvm-J0qPwCyffsBnYOOQuZaQPFlrkGDGKHVtcx1mp451qGQMZaRvPuCLOCjHgkk05cpKbMFOeN7u2Fc7roDLg-R3sNIIPuH-onbSpuRVJ1mAXyeq53bNRcch5sWUvBrfUHXXhtegbt8ae-1usMuwy8NwJCAfExXAlM4r-yp_JIDE1IbfihQUckTrrWxRDrXMF1WDnIh6QdfiSqWD6X6EbgTiOQQSSCjP-qDft_jY2X4_epMn-3u4HcDYnpD87pO_v2-UlRFaufikXU1Q3QkdOY5A6oaKP5gYichUMAiXIR7USfwnVXXfqaRqdEL0SqMjX3jCO2RvtlEMx1jmLVzVHhdqB8qdNXoL0Qz09sFSwrR46M1"
data-digitalmedia-asset-urn="urn:li:lyndaVideo:(urn:li:lyndaCourse:761019,2815083)"
data-tracking-id="DJvYbPLNTWK7posByDamzw=="></video>
</div> there are direct links to the media file itself
(tokenised, with a defined lifespan) and to the WebVTT subtitles:
I suppose getting this to work with "paid for content" will require additional steps...
(w/ and w/o |
There is a LinkedIn extractor, but only for www.linkedin.com, and it doesn't understand this page structure, which has a good ld+json block apart from its invalid […]. After giving it a good talking-to: $ python -m youtube_dl -j 'https://de.linkedin.com/learning/einfuhrung-in-die-softwarearchitektur-1-grundlagen-begriffe-und-ausgewahlte-tools/was-ist-softwarearchitektur' | jq '.'
{
"display_id": "was-ist-softwarearchitektur",
"extractor": "linkedin:learning",
"protocol": "https",
"description": "Grundlagenwissen für Softwarearchitekten",
"upload_date": "20190605",
"timestamp": 1559692800,
"formats": [
{
"protocol": "https",
"format": "vbr-540 - 540p",
"url": "https://dms.licdn.com/playlist/C4D0DAQGPfzHW4lxbTw/learning-original-video-vbr-540/0/1598782239071?e=1665367200&v=beta&t=acTi1Goh8ZA7q9v2RAqERacJAikDZVcNmDaNfz8NpGs#.mp4",
"http_headers": ...,
"height": 540,
"ext": "mp4",
"format_id": "vbr-540"
}
],
"episode_id": "was-ist-softwarearchitektur",
"series_id": "einfuhrung-in-die-softwarearchitektur-1-grundlagen-begriffe-und-ausgewahlte-tools",
"_filename": "Was ist Softwarearchitektur - Einführung in die Softwarearchitektur 1 - Grundlagen, Begriffe und ausgewählte Tools-C4D0DAQGPfzHW4lxbTw.mp4",
"uploader": "Hendrik Lösch",
"duration": 312,
"format_id": "vbr-540",
"height": 540,
"http_headers": ...,
"id": "C4D0DAQGPfzHW4lxbTw",
"subtitles": {
"de": [
{
"url": "https://www.linkedin.com/ambry/?x-li-ambry-ep=AQIg1xl8j15WUAAAAYObefkbEndlIBeOIr8KICAJLJfFJxEGcNImqIc5sryMAQFVT5UIdIYmp4sS2d7uzpT_2Pn6XApik8l-7zhiIwKm9rKaiGCi-XfRdpKnA9e_vfZNd4012ocdN-6wLmPE7sSeF_AJIw8QoGja6KR-cNgrdyYMRjbQPrQJymTtoL4BP7z_JY4eM8IQMvrjlMoDDRGRlRr7Rq4lbXwP1iPGBfZb7KbDKu3ft1hbWHBuz3tMgYwtYnmAzvOJT6LIxhvNdZQTPc4g92B6OF3pz7xlaxfCVS8mGCKhs7ZNdkDw6juQDDIEedxwUrq4-xCTrTI3sNbfa8lIiTISHgfBuxsinODnef1mPHBFDxiJk16p2fw9SA3iUg1pqavE8Uj_JIP4DWHxtN43WK7R_onPH8KAgnvNUr5PVZBi3nzaqJbFSC9uSRDO2VDNkEcCqD15wA8oVJaAbsw4T87XteGZL8x-_klxt37qvzY_vlnfEZDgx033",
"ext": "vtt"
}
]
},
"view_count": 5709,
"playlist": null,
"thumbnails": [
{
"url": "https://media-exp1.licdn.com/dms/image/C4E0DAQEIBfhUmf3mlw/learning-public-crop_675_1200/0/1567118425117?e=1665367200&v=beta&t=VXMXYnLrAzWBVEQj73zJXVZc5_KcpSMaAmrNi87NL_Y",
"id": "0"
}
],
"title": "Was ist Softwarearchitektur? - Einführung in die Softwarearchitektur 1: Grundlagen, Begriffe und ausgewählte Tools",
"url": "https://dms.licdn.com/playlist/C4D0DAQGPfzHW4lxbTw/learning-original-video-vbr-540/0/1598782239071?e=1665367200&v=beta&t=acTi1Goh8ZA7q9v2RAqERacJAikDZVcNmDaNfz8NpGs#.mp4",
"extractor_key": "LinkedInLearning",
"format": "vbr-540 - 540p",
...
}
$ |
This is the patch I used: --- old/youtube_dl/extractor/linkedin.py
+++ new/youtube_dl/extractor/linkedin.py
@@ -5,9 +5,14 @@ import re
from .common import InfoExtractor
from ..utils import (
+ extract_attributes,
ExtractorError,
float_or_none,
int_or_none,
+ ISO639Utils,
+ merge_dicts,
+ try_get,
+ url_or_none,
urlencode_postdata,
urljoin,
)
@@ -17,7 +22,7 @@ class LinkedInLearningBaseIE(InfoExtractor):
_NETRC_MACHINE = 'linkedin'
_LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning'
- def _call_api(self, course_slug, fields, video_slug=None, resolution=None):
+ def _call_api(self, host, course_slug, fields, video_slug=None, resolution=None):
query = {
'courseSlug': course_slug,
'fields': fields,
@@ -27,10 +32,10 @@ class LinkedInLearningBaseIE(InfoExtractor):
if video_slug:
query.update({
'videoSlug': video_slug,
- 'resolution': '_%s' % resolution,
+ 'resolution': '_%s' % (resolution, ),
})
- sub = ' %dp' % resolution
- api_url = 'https://www.linkedin.com/learning-api/detailedCourses'
+ sub = ' %dp' % (resolution, )
+ api_url = 'https://%s.linkedin.com/learning-api/detailedCourses' % (host, )
return self._download_json(
api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={
'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value,
@@ -73,7 +78,7 @@ class LinkedInLearningBaseIE(InfoExtractor):
class LinkedInLearningIE(LinkedInLearningBaseIE):
IE_NAME = 'linkedin:learning'
- _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<course_slug>[^/]+)/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?P<host>www|[a-z]{2})?(?(host)\.)linkedin\.com/learning/(?P<course_slug>[^/]+)/(?P<id>[^/?#]+)'
_TEST = {
'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true',
'md5': 'a1d74422ff0d5e66a792deb996693167',
@@ -86,15 +91,53 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
},
}
+ def _extract_free(self, url, host, course_slug, video_slug):
+ webpage = self._download_webpage(url, video_slug)
+ info = self._search_json_ld(webpage, video_slug, expected_type='VideoObject', default={})
+ title = info['title']
+ info.pop('url', None)
+ native_video = self._search_regex(
+ r'''(<video\b[^>]+\bclass\s*=\s*(["'])(?:(?:(?!\2).)+?\s)?share-native-video__node video-js\2[^>]*>)''',
+ webpage, 'native video', default='')
+ native_video = extract_attributes(native_video)
+ sources = self._parse_json(native_video.get('data-sources', '[]'), video_slug)
+ formats = []
+ video_id = video_slug
+ for src in sources:
+ src = url_or_none(try_get(src, lambda x: x['src']))
+ if not src:
+ continue
+ format_id = self._search_regex(r'-([avt]br-\d+)/', src, 'format id', default=None)
+ video_id = self._search_regex(r'/playlist/([^/]+)/', src, 'video id', default=video_id)
+ ext = self._search_regex(r'#\.(\w+)$', src, 'ext', default=None)
+ formats.append({
+ 'url': src,
+ 'format_id': format_id,
+ 'ext': ext,
+ 'height': int_or_none(format_id.split('-')[-1]),
+ })
+ self._sort_formats(formats)
+ sttl_url = native_video.get('data-captions-url') if ISO639Utils.short2long(host) else None
+ return merge_dicts({
+ 'id': video_id,
+ 'display_id': video_slug,
+ 'episode_id': video_slug,
+ 'series_id': course_slug,
+ 'formats': formats,
+ 'subtitles': sttl_url and {host: [{'url': sttl_url, 'ext': 'vtt', }]},
+ }, info)
+
def _real_extract(self, url):
- course_slug, video_slug = re.match(self._VALID_URL, url).groups()
+ host, course_slug, video_slug = re.match(self._VALID_URL, url).groups()
video_data = None
formats = []
for width, height in ((640, 360), (960, 540), (1280, 720)):
- video_data = self._call_api(
- course_slug, 'selectedVideo', video_slug, height)['selectedVideo']
-
+ try:
+ video_data = self._call_api(
+ host or 'www', course_slug, 'selectedVideo', video_slug, height)['selectedVideo']
+ except (ExtractorError, KeyError):
+ return self._extract_free(url, host, course_slug, video_slug)
video_url_data = video_data.get('url') or {}
progressive_url = video_url_data.get('progressiveUrl')
if progressive_url:
@@ -155,7 +198,7 @@ class LinkedInLearningCourseIE(LinkedInLearningBaseIE):
def _real_extract(self, url):
course_slug = self._match_id(url)
- course_data = self._call_api(course_slug, 'chapters,description,title')
+ course_data = self._call_api('www', course_slug, 'chapters,description,title')
entries = []
for chapter_number, chapter in enumerate(course_data.get('chapters', []), 1): |
Checklist
Verbose log
Description
WRITE DESCRIPTION HERE
The text was updated successfully, but these errors were encountered: