From 3675f9f76fcda783b83e835b081dec3be9e9affb Mon Sep 17 00:00:00 2001 From: Martin Renold Date: Sat, 9 Dec 2023 17:32:58 +0100 Subject: [PATCH 01/14] [ie/mx3] Add extractor --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/mx3.py | 57 +++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 yt_dlp/extractor/mx3.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 62103f13c14..6ffc8d0319b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1124,6 +1124,7 @@ MusicdexArtistIE, MusicdexPlaylistIE, ) +from .mx3 import Mx3IE from .mxplayer import ( MxplayerIE, MxplayerShowIE, diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py new file mode 100644 index 00000000000..800a6f6b150 --- /dev/null +++ b/yt_dlp/extractor/mx3.py @@ -0,0 +1,57 @@ +import re + +from .common import InfoExtractor + + +class Mx3IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mx3\.ch/t/(?P[0-9A-Za-z]+)' + _TESTS = [{ + 'url': 'https://mx3.ch/t/1Cru', + 'md5': '4aa5e93c3a2da01048e22d7851dc0a70', + 'info_dict': { + 'id': '1Cru', + # This one is audio-only. Looks like an ordinary mp3. + 'ext': 'mp3', + 'artist': 'Tortue Tortue', + 'genre': 'Rock', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_large/1-s-envoler-1.jpg?1630272813', + 'title': 'Tortue Tortue - S\'envoler', + } + }, { + 'url': 'https://mx3.ch/t/1LIY', + 'md5': '87c856be272aa614febb9455aecb5833', + 'info_dict': { + 'id': '1LIY', + # This is a music video. 'file' says: ISO Media, MP4 Base Media v1 [ISO 14496-12:2003] + 'ext': 'mp4', + 'artist': 'The Broots', + 'genre': 'Electro', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_large/frame_0000.png?1686963670', + 'title': 'The Broots - The Broots-Larytta remix "Begging For Help"', + } + }] + + def _real_extract(self, url): + track_id = self._match_id(url) + webpage = self._download_webpage(url, track_id) + + title = self._og_search_title(webpage) + genre = self._html_search_regex(r']+class="single-band-genre"[^>]*>([^<]+)', + webpage, 'genre', fatal=False, flags=re.DOTALL) + thumbnail = self._og_search_thumbnail(webpage) + is_video = 'central-player-placeholder video' in webpage or '/video' in thumbnail + + if ' - ' in title: + artist, _track = title.split(' - ', maxsplit=1) + else: + artist = None + + return { + 'id': track_id, + 'url': f'https://mx3.ch/tracks/{track_id}/player_asset', + 'ext': 'mp4' if is_video else 'mp3', + 'title': title, + 'artist': artist, + 'genre': genre, + 'thumbnail': thumbnail, + } From 6c969641c2d99ac386bd0af82d39b2d214cc799e Mon Sep 17 00:00:00 2001 From: Martin Renold Date: Tue, 26 Dec 2023 12:05:06 +0100 Subject: [PATCH 02/14] [ie/mx3] Use JSON where possible --- yt_dlp/extractor/mx3.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index 800a6f6b150..f3d45458923 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -14,7 +14,7 @@ class Mx3IE(InfoExtractor): 'ext': 'mp3', 'artist': 'Tortue Tortue', 'genre': 'Rock', - 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_large/1-s-envoler-1.jpg?1630272813', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', 'title': 'Tortue Tortue - S\'envoler', } }, { @@ -26,25 +26,24 @@ class Mx3IE(InfoExtractor): 'ext': 'mp4', 'artist': 'The Broots', 'genre': 'Electro', - 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_large/frame_0000.png?1686963670', - 'title': 'The Broots - The Broots-Larytta remix "Begging For Help"', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', + 'title': 'The Broots-Larytta remix "Begging For Help"', } }] def _real_extract(self, url): track_id = self._match_id(url) webpage = self._download_webpage(url, track_id) + json = self._download_json(f'https://mx3.ch/t/{track_id}.json', track_id) + + title = json['title'] + artist = json.get('artist') + if artist and not title.startswith(artist): + title = artist + ' - ' + title - title = self._og_search_title(webpage) genre = self._html_search_regex(r']+class="single-band-genre"[^>]*>([^<]+)', webpage, 'genre', fatal=False, flags=re.DOTALL) - thumbnail = self._og_search_thumbnail(webpage) - is_video = 'central-player-placeholder video' in webpage or '/video' in thumbnail - - if ' - ' in title: - artist, _track = title.split(' - ', maxsplit=1) - else: - artist = None + is_video = json.get('type') == 'video' return { 'id': track_id, @@ -53,5 +52,5 @@ def _real_extract(self, url): 'title': title, 'artist': artist, 'genre': genre, - 'thumbnail': thumbnail, + 'thumbnail': json.get('picture_url_xlarge') or json.get('picture_url'), } From e93c02ab90a69d505f11bc29613648b7259886e1 Mon Sep 17 00:00:00 2001 From: Martin Renold Date: Tue, 26 Dec 2023 16:35:55 +0100 Subject: [PATCH 03/14] [ie/mx3] Support multiple formats, don't hardcode extension guess --- yt_dlp/extractor/mx3.py | 54 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index f3d45458923..b6643bf9749 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -1,16 +1,18 @@ import re from .common import InfoExtractor +from ..utils import urlhandle_detect_ext +from ..networking import HEADRequest class Mx3IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mx3\.ch/t/(?P[0-9A-Za-z]+)' _TESTS = [{ 'url': 'https://mx3.ch/t/1Cru', - 'md5': '4aa5e93c3a2da01048e22d7851dc0a70', + 'md5': '82510bf4c21f17da41bff7e1ffd84e78', 'info_dict': { 'id': '1Cru', - # This one is audio-only. Looks like an ordinary mp3. + # This one is audio-only. It's a mp3, but we have to make a HEAD request to find out. 'ext': 'mp3', 'artist': 'Tortue Tortue', 'genre': 'Rock', @@ -19,7 +21,7 @@ class Mx3IE(InfoExtractor): } }, { 'url': 'https://mx3.ch/t/1LIY', - 'md5': '87c856be272aa614febb9455aecb5833', + 'md5': '4117489dff8c763ecfbb0b95a67d6c8e', 'info_dict': { 'id': '1LIY', # This is a music video. 'file' says: ISO Media, MP4 Base Media v1 [ISO 14496-12:2003] @@ -29,6 +31,18 @@ class Mx3IE(InfoExtractor): 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', 'title': 'The Broots-Larytta remix "Begging For Help"', } + }, { + 'url': 'https://mx3.ch/t/1C6E', + 'md5': '1afcd578493ddb8e5008e94bb6d97e25', + 'info_dict': { + 'id': '1C6E', + # This one has a download button, yielding a WAV. + 'ext': 'wav', + 'artist': 'Alien Bubblegum', + 'genre': 'Punk', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733', + 'title': 'Alien Bubblegum - Wide Awake', + } }] def _real_extract(self, url): @@ -43,12 +57,40 @@ def _real_extract(self, url): genre = self._html_search_regex(r']+class="single-band-genre"[^>]*>([^<]+)', webpage, 'genre', fatal=False, flags=re.DOTALL) - is_video = json.get('type') == 'video' + + formats = [] + + def add_format(fmt): + urlh = self._request_webpage(HEADRequest(fmt['url']), track_id, note='Fetching media headers', fatal=False) + if urlh: + fmt['ext'] = urlhandle_detect_ext(urlh) + formats.append(fmt) + + add_format({ + 'url': 'https://mx3.ch/' + json['url'], + 'format_id': 'default', + 'quality': 1, + }) + + if 'hd_url' in json: + add_format({ + 'url': 'https://mx3.ch/' + json['hd_url'], + 'format_id': 'hd', + 'quality': 10, + }) + + # the "download" feature is not available everywhere + if f'/tracks/{track_id}/download' in webpage: + add_format({ + 'url': f'https://mx3.ch/tracks/{track_id}/download', + 'format_id': 'download', + 'quality': 11, + 'format_note': 'usually uncompressed WAV', + }) return { 'id': track_id, - 'url': f'https://mx3.ch/tracks/{track_id}/player_asset', - 'ext': 'mp4' if is_video else 'mp3', + 'formats': formats, 'title': title, 'artist': artist, 'genre': genre, From fb53e7197f1af0cb0c19025f9f1d69e90a4ac76b Mon Sep 17 00:00:00 2001 From: Martin Renold Date: Sun, 14 Jan 2024 19:42:51 +0100 Subject: [PATCH 04/14] [ie/mx3] Support for neo.mx3.ch and volksmusik.mx3.ch The track IDs on neo.mx3.ch and volksmusik.mx3.ch do not work on mx3.ch. The sites even require users to create a separate login. And also extract "composer" and "performer". --- yt_dlp/extractor/_extractors.py | 6 ++- yt_dlp/extractor/mx3.py | 82 +++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6ffc8d0319b..c0f6b32b697 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1124,7 +1124,11 @@ MusicdexArtistIE, MusicdexPlaylistIE, ) -from .mx3 import Mx3IE +from .mx3 import ( + Mx3IE, + Mx3NeoIE, + Mx3VolksmusikIE, +) from .mxplayer import ( MxplayerIE, MxplayerShowIE, diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index b6643bf9749..3c4d05e092a 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -6,6 +6,7 @@ class Mx3IE(InfoExtractor): + _MX3_DOMAIN = 'mx3.ch' _VALID_URL = r'https?://(?:www\.)?mx3\.ch/t/(?P[0-9A-Za-z]+)' _TESTS = [{ 'url': 'https://mx3.ch/t/1Cru', @@ -14,10 +15,11 @@ class Mx3IE(InfoExtractor): 'id': '1Cru', # This one is audio-only. It's a mp3, but we have to make a HEAD request to find out. 'ext': 'mp3', - 'artist': 'Tortue Tortue', - 'genre': 'Rock', + 'artist': ['Tortue Tortue', 'Godina'], + 'composer': 'Olivier Godinat', + 'genre': ['Rock'], 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', - 'title': 'Tortue Tortue - S\'envoler', + 'title': 'Tortue Tortue, Godina - S\'envoler', } }, { 'url': 'https://mx3.ch/t/1LIY', @@ -26,8 +28,9 @@ class Mx3IE(InfoExtractor): 'id': '1LIY', # This is a music video. 'file' says: ISO Media, MP4 Base Media v1 [ISO 14496-12:2003] 'ext': 'mp4', - 'artist': 'The Broots', - 'genre': 'Electro', + 'artist': ['The Broots', 'Tania Kimfumu'], + 'composer': 'Emmanuel Diserens', + 'genre': ['Electro'], 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', 'title': 'The Broots-Larytta remix "Begging For Help"', } @@ -38,8 +41,9 @@ class Mx3IE(InfoExtractor): 'id': '1C6E', # This one has a download button, yielding a WAV. 'ext': 'wav', - 'artist': 'Alien Bubblegum', - 'genre': 'Punk', + 'artist': ['Alien Bubblegum'], + 'composer': 'Alien Bubblegum', + 'genre': ['Punk'], 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733', 'title': 'Alien Bubblegum - Wide Awake', } @@ -48,15 +52,25 @@ class Mx3IE(InfoExtractor): def _real_extract(self, url): track_id = self._match_id(url) webpage = self._download_webpage(url, track_id) - json = self._download_json(f'https://mx3.ch/t/{track_id}.json', track_id) + json = self._download_json(f'https://{self._MX3_DOMAIN}/t/{track_id}.json', track_id) title = json['title'] - artist = json.get('artist') - if artist and not title.startswith(artist): - title = artist + ' - ' + title + + artists = [] + if json.get('artist'): + artists.append(json['artist']) + performer = json.get('performer_name') + if performer and performer not in artists: + artists.append(performer) + + if artists and not title.startswith(artists[0]): + title = ', '.join(artists) + ' - ' + title genre = self._html_search_regex(r']+class="single-band-genre"[^>]*>([^<]+)', webpage, 'genre', fatal=False, flags=re.DOTALL) + if genre: + assert isinstance(genre, str) + genre = [s.strip() for s in genre.split(',')] formats = [] @@ -66,15 +80,16 @@ def add_format(fmt): fmt['ext'] = urlhandle_detect_ext(urlh) formats.append(fmt) + base_url = 'https://' + self._MX3_DOMAIN + '/' add_format({ - 'url': 'https://mx3.ch/' + json['url'], + 'url': base_url + json["url"], 'format_id': 'default', 'quality': 1, }) if 'hd_url' in json: add_format({ - 'url': 'https://mx3.ch/' + json['hd_url'], + 'url': base_url + json['hd_url'], 'format_id': 'hd', 'quality': 10, }) @@ -82,7 +97,7 @@ def add_format(fmt): # the "download" feature is not available everywhere if f'/tracks/{track_id}/download' in webpage: add_format({ - 'url': f'https://mx3.ch/tracks/{track_id}/download', + 'url': f'{base_url}tracks/{track_id}/download', 'format_id': 'download', 'quality': 11, 'format_note': 'usually uncompressed WAV', @@ -92,7 +107,44 @@ def add_format(fmt): 'id': track_id, 'formats': formats, 'title': title, - 'artist': artist, + 'artist': artists, + 'composer': json.get('composer_name', None), 'genre': genre, 'thumbnail': json.get('picture_url_xlarge') or json.get('picture_url'), } + + +class Mx3NeoIE(Mx3IE): + _MX3_DOMAIN = 'neo.mx3.ch' + _VALID_URL = r'https?://(?:www\.)?neo.mx3\.ch/t/(?P[0-9A-Za-z]+)' + _TESTS = [{ + 'url': 'https://neo.mx3.ch/t/1hpd', + 'md5': 'ff0b2b91ce0b8931c0a358715758dc78', + 'info_dict': { + 'id': '1hpd', + 'ext': 'mp3', + 'artist': ['Kammerorchester Basel', 'Baptiste Lopez'], + 'composer': 'Jannik Giger', + 'genre': ['Composition', 'Orchestra'], + 'title': 'Kammerorchester Basel, Baptiste Lopez - Troisième œil. Für Kammerorchester (2023)', + 'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252' + } + }] + + +class Mx3VolksmusikIE(Mx3IE): + _MX3_DOMAIN = 'volksmusik.mx3.ch' + _VALID_URL = r'https?://(?:www\.)?volksmusik.mx3\.ch/t/(?P[0-9A-Za-z]+)' + _TESTS = [{ + 'url': 'https://volksmusik.mx3.ch/t/Zx', + 'md5': 'dd967a7b0c1ef898f3e072cf9c2eae3c', + 'info_dict': { + 'id': 'Zx', + 'ext': 'mp3', + 'artist': ['Ländlerkapelle GrischArt'], + 'composer': 'Urs Glauser', + 'genre': ['Instrumental', 'Graubünden'], + 'title': 'Ländlerkapelle GrischArt - Chämilouf', + 'thumbnail': 'https://volksmusik.mx3.ch/pictures/vxm/file/0000/3815/square_xlarge/grischart1.jpg?1450530120', + } + }] From ce38da900bab8150de684c5061b3016d8f7b0c77 Mon Sep 17 00:00:00 2001 From: Martin Renold Date: Sun, 14 Jan 2024 20:44:02 +0100 Subject: [PATCH 05/14] [ie/mx3] Do not prepend artist to title, do not output lists Some other extractors use lists too, but it doesn't work well with filename templates. --- yt_dlp/extractor/mx3.py | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index 3c4d05e092a..ffe6e75efbe 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -15,11 +15,11 @@ class Mx3IE(InfoExtractor): 'id': '1Cru', # This one is audio-only. It's a mp3, but we have to make a HEAD request to find out. 'ext': 'mp3', - 'artist': ['Tortue Tortue', 'Godina'], + 'artist': 'Tortue Tortue, Godina', 'composer': 'Olivier Godinat', - 'genre': ['Rock'], + 'genre': 'Rock', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', - 'title': 'Tortue Tortue, Godina - S\'envoler', + 'title': 'S\'envoler', } }, { 'url': 'https://mx3.ch/t/1LIY', @@ -28,9 +28,9 @@ class Mx3IE(InfoExtractor): 'id': '1LIY', # This is a music video. 'file' says: ISO Media, MP4 Base Media v1 [ISO 14496-12:2003] 'ext': 'mp4', - 'artist': ['The Broots', 'Tania Kimfumu'], + 'artist': 'The Broots, Tania Kimfumu', 'composer': 'Emmanuel Diserens', - 'genre': ['Electro'], + 'genre': 'Electro', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', 'title': 'The Broots-Larytta remix "Begging For Help"', } @@ -41,11 +41,11 @@ class Mx3IE(InfoExtractor): 'id': '1C6E', # This one has a download button, yielding a WAV. 'ext': 'wav', - 'artist': ['Alien Bubblegum'], + 'artist': 'Alien Bubblegum', 'composer': 'Alien Bubblegum', - 'genre': ['Punk'], + 'genre': 'Punk', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733', - 'title': 'Alien Bubblegum - Wide Awake', + 'title': 'Wide Awake', } }] @@ -54,8 +54,6 @@ def _real_extract(self, url): webpage = self._download_webpage(url, track_id) json = self._download_json(f'https://{self._MX3_DOMAIN}/t/{track_id}.json', track_id) - title = json['title'] - artists = [] if json.get('artist'): artists.append(json['artist']) @@ -63,14 +61,10 @@ def _real_extract(self, url): if performer and performer not in artists: artists.append(performer) - if artists and not title.startswith(artists[0]): - title = ', '.join(artists) + ' - ' + title + title = json['title'] genre = self._html_search_regex(r']+class="single-band-genre"[^>]*>([^<]+)', webpage, 'genre', fatal=False, flags=re.DOTALL) - if genre: - assert isinstance(genre, str) - genre = [s.strip() for s in genre.split(',')] formats = [] @@ -107,7 +101,7 @@ def add_format(fmt): 'id': track_id, 'formats': formats, 'title': title, - 'artist': artists, + 'artist': ', '.join(artists), 'composer': json.get('composer_name', None), 'genre': genre, 'thumbnail': json.get('picture_url_xlarge') or json.get('picture_url'), @@ -123,10 +117,10 @@ class Mx3NeoIE(Mx3IE): 'info_dict': { 'id': '1hpd', 'ext': 'mp3', - 'artist': ['Kammerorchester Basel', 'Baptiste Lopez'], + 'artist': 'Kammerorchester Basel, Baptiste Lopez', 'composer': 'Jannik Giger', - 'genre': ['Composition', 'Orchestra'], - 'title': 'Kammerorchester Basel, Baptiste Lopez - Troisième œil. Für Kammerorchester (2023)', + 'genre': 'Composition, Orchestra', + 'title': 'Troisième œil. Für Kammerorchester (2023)', 'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252' } }] @@ -141,10 +135,10 @@ class Mx3VolksmusikIE(Mx3IE): 'info_dict': { 'id': 'Zx', 'ext': 'mp3', - 'artist': ['Ländlerkapelle GrischArt'], + 'artist': 'Ländlerkapelle GrischArt', 'composer': 'Urs Glauser', - 'genre': ['Instrumental', 'Graubünden'], - 'title': 'Ländlerkapelle GrischArt - Chämilouf', + 'genre': 'Instrumental, Graubünden', + 'title': 'Chämilouf', 'thumbnail': 'https://volksmusik.mx3.ch/pictures/vxm/file/0000/3815/square_xlarge/grischart1.jpg?1450530120', } }] From 55d4944b9457d414c82936db5bdfc54950503e3c Mon Sep 17 00:00:00 2001 From: Martin Renold Date: Fri, 19 Jan 2024 18:11:52 +0100 Subject: [PATCH 06/14] [ie/mx3] Refactor: abstract base class --- yt_dlp/extractor/mx3.py | 94 +++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index ffe6e75efbe..f61038643b0 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -5,49 +5,8 @@ from ..networking import HEADRequest -class Mx3IE(InfoExtractor): - _MX3_DOMAIN = 'mx3.ch' - _VALID_URL = r'https?://(?:www\.)?mx3\.ch/t/(?P[0-9A-Za-z]+)' - _TESTS = [{ - 'url': 'https://mx3.ch/t/1Cru', - 'md5': '82510bf4c21f17da41bff7e1ffd84e78', - 'info_dict': { - 'id': '1Cru', - # This one is audio-only. It's a mp3, but we have to make a HEAD request to find out. - 'ext': 'mp3', - 'artist': 'Tortue Tortue, Godina', - 'composer': 'Olivier Godinat', - 'genre': 'Rock', - 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', - 'title': 'S\'envoler', - } - }, { - 'url': 'https://mx3.ch/t/1LIY', - 'md5': '4117489dff8c763ecfbb0b95a67d6c8e', - 'info_dict': { - 'id': '1LIY', - # This is a music video. 'file' says: ISO Media, MP4 Base Media v1 [ISO 14496-12:2003] - 'ext': 'mp4', - 'artist': 'The Broots, Tania Kimfumu', - 'composer': 'Emmanuel Diserens', - 'genre': 'Electro', - 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', - 'title': 'The Broots-Larytta remix "Begging For Help"', - } - }, { - 'url': 'https://mx3.ch/t/1C6E', - 'md5': '1afcd578493ddb8e5008e94bb6d97e25', - 'info_dict': { - 'id': '1C6E', - # This one has a download button, yielding a WAV. - 'ext': 'wav', - 'artist': 'Alien Bubblegum', - 'composer': 'Alien Bubblegum', - 'genre': 'Punk', - 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733', - 'title': 'Wide Awake', - } - }] +class Mx3BaseIE(InfoExtractor): + _MX3_DOMAIN = None def _real_extract(self, url): track_id = self._match_id(url) @@ -108,7 +67,52 @@ def add_format(fmt): } -class Mx3NeoIE(Mx3IE): +class Mx3IE(Mx3BaseIE): + _MX3_DOMAIN = 'mx3.ch' + _VALID_URL = r'https?://(?:www\.)?mx3\.ch/t/(?P[0-9A-Za-z]+)' + _TESTS = [{ + 'url': 'https://mx3.ch/t/1Cru', + 'md5': '82510bf4c21f17da41bff7e1ffd84e78', + 'info_dict': { + 'id': '1Cru', + # This one is audio-only. It's a mp3, but we have to make a HEAD request to find out. + 'ext': 'mp3', + 'artist': 'Tortue Tortue, Godina', + 'composer': 'Olivier Godinat', + 'genre': 'Rock', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', + 'title': 'S\'envoler', + } + }, { + 'url': 'https://mx3.ch/t/1LIY', + 'md5': '4117489dff8c763ecfbb0b95a67d6c8e', + 'info_dict': { + 'id': '1LIY', + # This is a music video. 'file' says: ISO Media, MP4 Base Media v1 [ISO 14496-12:2003] + 'ext': 'mp4', + 'artist': 'The Broots, Tania Kimfumu', + 'composer': 'Emmanuel Diserens', + 'genre': 'Electro', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', + 'title': 'The Broots-Larytta remix "Begging For Help"', + } + }, { + 'url': 'https://mx3.ch/t/1C6E', + 'md5': '1afcd578493ddb8e5008e94bb6d97e25', + 'info_dict': { + 'id': '1C6E', + # This one has a download button, yielding a WAV. + 'ext': 'wav', + 'artist': 'Alien Bubblegum', + 'composer': 'Alien Bubblegum', + 'genre': 'Punk', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733', + 'title': 'Wide Awake', + } + }] + + +class Mx3NeoIE(Mx3BaseIE): _MX3_DOMAIN = 'neo.mx3.ch' _VALID_URL = r'https?://(?:www\.)?neo.mx3\.ch/t/(?P[0-9A-Za-z]+)' _TESTS = [{ @@ -126,7 +130,7 @@ class Mx3NeoIE(Mx3IE): }] -class Mx3VolksmusikIE(Mx3IE): +class Mx3VolksmusikIE(Mx3BaseIE): _MX3_DOMAIN = 'volksmusik.mx3.ch' _VALID_URL = r'https?://(?:www\.)?volksmusik.mx3\.ch/t/(?P[0-9A-Za-z]+)' _TESTS = [{ From 8f43d974200669a5d9a582f0aa85633395552821 Mon Sep 17 00:00:00 2001 From: Martin Renold Date: Fri, 19 Jan 2024 20:45:09 +0100 Subject: [PATCH 07/14] [ie/mx3] Always try HEAD on media URLs; extract size and timestamp --- yt_dlp/extractor/mx3.py | 70 ++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index f61038643b0..bf5cbaa0ee5 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -1,7 +1,13 @@ import re from .common import InfoExtractor -from ..utils import urlhandle_detect_ext +from ..utils.traversal import traverse_obj +from ..utils import ( + urlhandle_detect_ext, + url_or_none, + int_or_none, + unified_timestamp, +) from ..networking import HEADRequest @@ -20,50 +26,51 @@ def _real_extract(self, url): if performer and performer not in artists: artists.append(performer) - title = json['title'] - genre = self._html_search_regex(r']+class="single-band-genre"[^>]*>([^<]+)', webpage, 'genre', fatal=False, flags=re.DOTALL) formats = [] - def add_format(fmt): - urlh = self._request_webpage(HEADRequest(fmt['url']), track_id, note='Fetching media headers', fatal=False) - if urlh: + def add_format(fmt, fatal): + if fatal: + urlh = self._request_webpage(HEADRequest(fmt['url']), track_id, note='Fetching default media headers') + else: + urlh = self._request_webpage(HEADRequest(fmt['url']), track_id, fatal=False, expected_status=404, + note=f'Trying media headers for optional format {fmt["format_id"]}') + if urlh and urlh.status == 200: fmt['ext'] = urlhandle_detect_ext(urlh) + fmt['filesize'] = int_or_none(urlh.headers.get('Content-Length')) + fmt['timestamp'] = unified_timestamp(urlh.headers.get('Last-Modified')) formats.append(fmt) - base_url = 'https://' + self._MX3_DOMAIN + '/' + track_url = f'https://{self._MX3_DOMAIN}/tracks/{track_id}' add_format({ - 'url': base_url + json["url"], + 'url': f'{track_url}/player_asset', 'format_id': 'default', 'quality': 1, - }) - - if 'hd_url' in json: - add_format({ - 'url': base_url + json['hd_url'], - 'format_id': 'hd', - 'quality': 10, - }) - - # the "download" feature is not available everywhere - if f'/tracks/{track_id}/download' in webpage: - add_format({ - 'url': f'{base_url}tracks/{track_id}/download', - 'format_id': 'download', - 'quality': 11, - 'format_note': 'usually uncompressed WAV', - }) + }, fatal=True) + # the formats below don't always exist + add_format({ + 'url': f'{track_url}/player_asset?quality=hd', + 'format_id': 'hd', + 'quality': 10, + }, fatal=False) + add_format({ + 'url': f'{track_url}/download', + 'format_id': 'download', + 'quality': 11, + }, fatal=False) return { 'id': track_id, 'formats': formats, - 'title': title, 'artist': ', '.join(artists), - 'composer': json.get('composer_name', None), 'genre': genre, - 'thumbnail': json.get('picture_url_xlarge') or json.get('picture_url'), + **traverse_obj(json, { + 'title': ('title', {str}), + 'composer': ('composer_name', {str}), + 'thumbnail': (('picture_url_xlarge', 'picture_url'), {url_or_none}), + }, get_all=False), } @@ -82,6 +89,7 @@ class Mx3IE(Mx3BaseIE): 'genre': 'Rock', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', 'title': 'S\'envoler', + 'timestamp': 1630272831, } }, { 'url': 'https://mx3.ch/t/1LIY', @@ -95,6 +103,7 @@ class Mx3IE(Mx3BaseIE): 'genre': 'Electro', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', 'title': 'The Broots-Larytta remix "Begging For Help"', + 'timestamp': 1686963636, } }, { 'url': 'https://mx3.ch/t/1C6E', @@ -108,6 +117,7 @@ class Mx3IE(Mx3BaseIE): 'genre': 'Punk', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733', 'title': 'Wide Awake', + 'timestamp': 1627054732, } }] @@ -125,7 +135,8 @@ class Mx3NeoIE(Mx3BaseIE): 'composer': 'Jannik Giger', 'genre': 'Composition, Orchestra', 'title': 'Troisième œil. Für Kammerorchester (2023)', - 'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252' + 'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252', + 'timestamp': 1705055012, } }] @@ -144,5 +155,6 @@ class Mx3VolksmusikIE(Mx3BaseIE): 'genre': 'Instrumental, Graubünden', 'title': 'Chämilouf', 'thumbnail': 'https://volksmusik.mx3.ch/pictures/vxm/file/0000/3815/square_xlarge/grischart1.jpg?1450530120', + 'timestamp': 1450532809, } }] From ad2893a559afc807b34cd6e21931a7f3cd98c5df Mon Sep 17 00:00:00 2001 From: Martin Renold Date: Sat, 20 Jan 2024 10:39:18 +0100 Subject: [PATCH 08/14] [ie/mx3] Refactoring and fixes from review --- yt_dlp/extractor/mx3.py | 56 +++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index bf5cbaa0ee5..eb1e9383e07 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -1,14 +1,13 @@ import re from .common import InfoExtractor -from ..utils.traversal import traverse_obj +from ..networking import HEADRequest from ..utils import ( - urlhandle_detect_ext, - url_or_none, int_or_none, - unified_timestamp, + url_or_none, + urlhandle_detect_ext ) -from ..networking import HEADRequest +from ..utils.traversal import traverse_obj class Mx3BaseIE(InfoExtractor): @@ -17,30 +16,17 @@ class Mx3BaseIE(InfoExtractor): def _real_extract(self, url): track_id = self._match_id(url) webpage = self._download_webpage(url, track_id) - json = self._download_json(f'https://{self._MX3_DOMAIN}/t/{track_id}.json', track_id) - - artists = [] - if json.get('artist'): - artists.append(json['artist']) - performer = json.get('performer_name') - if performer and performer not in artists: - artists.append(performer) - - genre = self._html_search_regex(r']+class="single-band-genre"[^>]*>([^<]+)', - webpage, 'genre', fatal=False, flags=re.DOTALL) + data = self._download_json( + f'https://{self._MX3_DOMAIN}/t/{track_id}.json', track_id, fatal=False) formats = [] - def add_format(fmt, fatal): - if fatal: - urlh = self._request_webpage(HEADRequest(fmt['url']), track_id, note='Fetching default media headers') - else: - urlh = self._request_webpage(HEADRequest(fmt['url']), track_id, fatal=False, expected_status=404, - note=f'Trying media headers for optional format {fmt["format_id"]}') + def add_format(fmt): + urlh = self._request_webpage(HEADRequest(fmt['url']), track_id, fatal=False, expected_status=404, + note=f'Checking for format {fmt["format_id"]}') if urlh and urlh.status == 200: fmt['ext'] = urlhandle_detect_ext(urlh) fmt['filesize'] = int_or_none(urlh.headers.get('Content-Length')) - fmt['timestamp'] = unified_timestamp(urlh.headers.get('Last-Modified')) formats.append(fmt) track_url = f'https://{self._MX3_DOMAIN}/tracks/{track_id}' @@ -48,25 +34,34 @@ def add_format(fmt, fatal): 'url': f'{track_url}/player_asset', 'format_id': 'default', 'quality': 1, - }, fatal=True) + }) # the formats below don't always exist add_format({ 'url': f'{track_url}/player_asset?quality=hd', 'format_id': 'hd', 'quality': 10, - }, fatal=False) + }) add_format({ 'url': f'{track_url}/download', 'format_id': 'download', 'quality': 11, - }, fatal=False) + }) + + artists = [] + if data: + if data.get('artist'): + artists.append(data['artist']) + performer = data.get('performer_name') + if performer and performer not in artists: + artists.append(performer) return { 'id': track_id, 'formats': formats, 'artist': ', '.join(artists), - 'genre': genre, - **traverse_obj(json, { + 'genre': self._html_search_regex(r']+class="single-band-genre"[^>]*>([^<]+)', + webpage, 'genre', fatal=False, flags=re.DOTALL), + **traverse_obj(data, { 'title': ('title', {str}), 'composer': ('composer_name', {str}), 'thumbnail': (('picture_url_xlarge', 'picture_url'), {url_or_none}), @@ -89,7 +84,6 @@ class Mx3IE(Mx3BaseIE): 'genre': 'Rock', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', 'title': 'S\'envoler', - 'timestamp': 1630272831, } }, { 'url': 'https://mx3.ch/t/1LIY', @@ -103,7 +97,6 @@ class Mx3IE(Mx3BaseIE): 'genre': 'Electro', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', 'title': 'The Broots-Larytta remix "Begging For Help"', - 'timestamp': 1686963636, } }, { 'url': 'https://mx3.ch/t/1C6E', @@ -117,7 +110,6 @@ class Mx3IE(Mx3BaseIE): 'genre': 'Punk', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733', 'title': 'Wide Awake', - 'timestamp': 1627054732, } }] @@ -136,7 +128,6 @@ class Mx3NeoIE(Mx3BaseIE): 'genre': 'Composition, Orchestra', 'title': 'Troisième œil. Für Kammerorchester (2023)', 'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252', - 'timestamp': 1705055012, } }] @@ -155,6 +146,5 @@ class Mx3VolksmusikIE(Mx3BaseIE): 'genre': 'Instrumental, Graubünden', 'title': 'Chämilouf', 'thumbnail': 'https://volksmusik.mx3.ch/pictures/vxm/file/0000/3815/square_xlarge/grischart1.jpg?1450530120', - 'timestamp': 1450532809, } }] From b340d5b7211e47c5b2815c50c6dd05b98bdc92eb Mon Sep 17 00:00:00 2001 From: Martin Renold Date: Sat, 20 Jan 2024 12:11:50 +0100 Subject: [PATCH 09/14] [ie/mx3] Extract more fields, extract performer as artist Code as suggested by sepro; updated tests. --- yt_dlp/extractor/mx3.py | 50 +++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index eb1e9383e07..8f8a22de5ce 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -3,9 +3,11 @@ from .common import InfoExtractor from ..networking import HEADRequest from ..utils import ( + get_element_by_class, int_or_none, + try_call, url_or_none, - urlhandle_detect_ext + urlhandle_detect_ext, ) from ..utils.traversal import traverse_obj @@ -47,22 +49,25 @@ def add_format(fmt): 'quality': 11, }) - artists = [] - if data: - if data.get('artist'): - artists.append(data['artist']) - performer = data.get('performer_name') - if performer and performer not in artists: - artists.append(performer) + more_info = get_element_by_class('single-more-info', webpage) + + def get_info_field(name): + return self._html_search_regex( + rf']*>\s*{name}\s*\s*]*>(.*?)', + more_info, name, default=None, flags=re.DOTALL) return { 'id': track_id, 'formats': formats, - 'artist': ', '.join(artists), - 'genre': self._html_search_regex(r']+class="single-band-genre"[^>]*>([^<]+)', - webpage, 'genre', fatal=False, flags=re.DOTALL), + 'genre': self._html_search_regex( + r']+class="single-band-genre"[^>]*>([^<]+)', webpage, 'genre', fatal=False), + 'release_year': int_or_none(get_info_field('Year of creation')), + 'description ': get_info_field('Description'), + 'tags': try_call(lambda: get_info_field('Tag').split(', '), list), **traverse_obj(data, { 'title': ('title', {str}), + 'artist': (('performer_name', 'artist'), {str}), + 'album_artist': ('artist', {str}), 'composer': ('composer_name', {str}), 'thumbnail': (('picture_url_xlarge', 'picture_url'), {url_or_none}), }, get_all=False), @@ -79,11 +84,14 @@ class Mx3IE(Mx3BaseIE): 'id': '1Cru', # This one is audio-only. It's a mp3, but we have to make a HEAD request to find out. 'ext': 'mp3', - 'artist': 'Tortue Tortue, Godina', + 'artist': 'Godina', + 'album_artist': 'Tortue Tortue', 'composer': 'Olivier Godinat', 'genre': 'Rock', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', 'title': 'S\'envoler', + 'release_year': 2021, + 'tags': [], } }, { 'url': 'https://mx3.ch/t/1LIY', @@ -92,11 +100,15 @@ class Mx3IE(Mx3BaseIE): 'id': '1LIY', # This is a music video. 'file' says: ISO Media, MP4 Base Media v1 [ISO 14496-12:2003] 'ext': 'mp4', - 'artist': 'The Broots, Tania Kimfumu', + 'artist': 'Tania Kimfumu', + 'album_artist': 'The Broots', 'composer': 'Emmanuel Diserens', 'genre': 'Electro', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', 'title': 'The Broots-Larytta remix "Begging For Help"', + 'release_year': 2023, + 'tags': ['the broots', 'cassata records', 'larytta'], + 'description ': '"Begging for Help" Larytta Remix Official Video\nRealized By Kali Donkilie in 2023', } }, { 'url': 'https://mx3.ch/t/1C6E', @@ -106,10 +118,13 @@ class Mx3IE(Mx3BaseIE): # This one has a download button, yielding a WAV. 'ext': 'wav', 'artist': 'Alien Bubblegum', + 'album_artist': 'Alien Bubblegum', 'composer': 'Alien Bubblegum', 'genre': 'Punk', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733', 'title': 'Wide Awake', + 'release_year': 2021, + 'tags': ['alien bubblegum', 'bubblegum', 'alien', 'pop punk', 'poppunk'], } }] @@ -123,11 +138,15 @@ class Mx3NeoIE(Mx3BaseIE): 'info_dict': { 'id': '1hpd', 'ext': 'mp3', - 'artist': 'Kammerorchester Basel, Baptiste Lopez', + 'artist': 'Baptiste Lopez', + 'album_artist': 'Kammerorchester Basel', 'composer': 'Jannik Giger', 'genre': 'Composition, Orchestra', 'title': 'Troisième œil. Für Kammerorchester (2023)', 'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252', + 'release_year': 2023, + 'tags': [], + 'description': None, # Not filled under ""there are elngthy is a lengthy description, but we fail to extract it currently } }] @@ -142,9 +161,12 @@ class Mx3VolksmusikIE(Mx3BaseIE): 'id': 'Zx', 'ext': 'mp3', 'artist': 'Ländlerkapelle GrischArt', + 'album_artist': 'Ländlerkapelle GrischArt', 'composer': 'Urs Glauser', 'genre': 'Instrumental, Graubünden', 'title': 'Chämilouf', 'thumbnail': 'https://volksmusik.mx3.ch/pictures/vxm/file/0000/3815/square_xlarge/grischart1.jpg?1450530120', + 'release_year': 2012, + 'tags': [], } }] From 43b916ded92ce66e5eddd319238bb8aee4702c40 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Sat, 20 Jan 2024 14:41:22 +0100 Subject: [PATCH 10/14] Small cleanup --- yt_dlp/extractor/mx3.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index 8f8a22de5ce..b9533dbc658 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -24,8 +24,9 @@ def _real_extract(self, url): formats = [] def add_format(fmt): - urlh = self._request_webpage(HEADRequest(fmt['url']), track_id, fatal=False, expected_status=404, - note=f'Checking for format {fmt["format_id"]}') + urlh = self._request_webpage( + HEADRequest(fmt['url']), track_id, fatal=False, expected_status=404, + note=f'Checking for format {fmt["format_id"]}') if urlh and urlh.status == 200: fmt['ext'] = urlhandle_detect_ext(urlh) fmt['filesize'] = int_or_none(urlh.headers.get('Content-Length')) @@ -37,7 +38,6 @@ def add_format(fmt): 'format_id': 'default', 'quality': 1, }) - # the formats below don't always exist add_format({ 'url': f'{track_url}/player_asset?quality=hd', 'format_id': 'hd', @@ -48,6 +48,11 @@ def add_format(fmt): 'format_id': 'download', 'quality': 11, }) + add_format({ + 'url': f'{track_url}/player_asset?quality=source', + 'format_id': 'source', + 'quality': 11, + }) more_info = get_element_by_class('single-more-info', webpage) @@ -62,7 +67,7 @@ def get_info_field(name): 'genre': self._html_search_regex( r']+class="single-band-genre"[^>]*>([^<]+)', webpage, 'genre', fatal=False), 'release_year': int_or_none(get_info_field('Year of creation')), - 'description ': get_info_field('Description'), + 'description': get_info_field('Description'), 'tags': try_call(lambda: get_info_field('Tag').split(', '), list), **traverse_obj(data, { 'title': ('title', {str}), @@ -79,27 +84,25 @@ class Mx3IE(Mx3BaseIE): _VALID_URL = r'https?://(?:www\.)?mx3\.ch/t/(?P[0-9A-Za-z]+)' _TESTS = [{ 'url': 'https://mx3.ch/t/1Cru', - 'md5': '82510bf4c21f17da41bff7e1ffd84e78', + 'md5': '7ba09e9826b4447d4e1ce9d69e0e295f', 'info_dict': { 'id': '1Cru', - # This one is audio-only. It's a mp3, but we have to make a HEAD request to find out. - 'ext': 'mp3', + 'ext': 'wav', 'artist': 'Godina', 'album_artist': 'Tortue Tortue', 'composer': 'Olivier Godinat', 'genre': 'Rock', 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', - 'title': 'S\'envoler', + 'title': "S'envoler", 'release_year': 2021, 'tags': [], } }, { 'url': 'https://mx3.ch/t/1LIY', - 'md5': '4117489dff8c763ecfbb0b95a67d6c8e', + 'md5': '48293cb908342547827f963a5a2e9118', 'info_dict': { 'id': '1LIY', - # This is a music video. 'file' says: ISO Media, MP4 Base Media v1 [ISO 14496-12:2003] - 'ext': 'mp4', + 'ext': 'mov', 'artist': 'Tania Kimfumu', 'album_artist': 'The Broots', 'composer': 'Emmanuel Diserens', @@ -108,14 +111,13 @@ class Mx3IE(Mx3BaseIE): 'title': 'The Broots-Larytta remix "Begging For Help"', 'release_year': 2023, 'tags': ['the broots', 'cassata records', 'larytta'], - 'description ': '"Begging for Help" Larytta Remix Official Video\nRealized By Kali Donkilie in 2023', + 'description': '"Begging for Help" Larytta Remix Official Video\nRealized By Kali Donkilie in 2023', } }, { 'url': 'https://mx3.ch/t/1C6E', 'md5': '1afcd578493ddb8e5008e94bb6d97e25', 'info_dict': { 'id': '1C6E', - # This one has a download button, yielding a WAV. 'ext': 'wav', 'artist': 'Alien Bubblegum', 'album_artist': 'Alien Bubblegum', @@ -134,10 +136,10 @@ class Mx3NeoIE(Mx3BaseIE): _VALID_URL = r'https?://(?:www\.)?neo.mx3\.ch/t/(?P[0-9A-Za-z]+)' _TESTS = [{ 'url': 'https://neo.mx3.ch/t/1hpd', - 'md5': 'ff0b2b91ce0b8931c0a358715758dc78', + 'md5': '6d9986bbae5cac3296ec8813bf965eb2', 'info_dict': { 'id': '1hpd', - 'ext': 'mp3', + 'ext': 'wav', 'artist': 'Baptiste Lopez', 'album_artist': 'Kammerorchester Basel', 'composer': 'Jannik Giger', @@ -146,7 +148,6 @@ class Mx3NeoIE(Mx3BaseIE): 'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252', 'release_year': 2023, 'tags': [], - 'description': None, # Not filled under ""there are elngthy is a lengthy description, but we fail to extract it currently } }] From 272551dd39d6066161c2ce7c40faa922c91985f0 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 21 Jan 2024 02:25:46 +0000 Subject: [PATCH 11/14] refactor --- yt_dlp/extractor/mx3.py | 86 ++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index b9533dbc658..71870cdc088 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -13,59 +13,57 @@ class Mx3BaseIE(InfoExtractor): - _MX3_DOMAIN = None - - def _real_extract(self, url): - track_id = self._match_id(url) - webpage = self._download_webpage(url, track_id) - data = self._download_json( - f'https://{self._MX3_DOMAIN}/t/{track_id}.json', track_id, fatal=False) + _VALID_URL_TMPL = r'https?://(?:www\.)?%s/t/(?P\w+)' + _FORMATS = [{ + 'url': 'player_asset', + 'format_id': 'default', + 'quality': 1, + }, { + 'url': 'player_asset?quality=hd', + 'format_id': 'hd', + 'quality': 10, + }, { + 'url': 'download', + 'format_id': 'download', + 'quality': 11, + }, { + 'url': 'player_asset?quality=source', + 'format_id': 'source', + 'quality': 11, + }] + def _extract_formats(self, track_id): formats = [] - - def add_format(fmt): + for fmt in self._FORMATS: + format_url = f'https://{self._DOMAIN}/tracks/{track_id}/{fmt["url"]}' urlh = self._request_webpage( - HEADRequest(fmt['url']), track_id, fatal=False, expected_status=404, + HEADRequest(format_url), track_id, fatal=False, expected_status=404, note=f'Checking for format {fmt["format_id"]}') if urlh and urlh.status == 200: - fmt['ext'] = urlhandle_detect_ext(urlh) - fmt['filesize'] = int_or_none(urlh.headers.get('Content-Length')) - formats.append(fmt) - - track_url = f'https://{self._MX3_DOMAIN}/tracks/{track_id}' - add_format({ - 'url': f'{track_url}/player_asset', - 'format_id': 'default', - 'quality': 1, - }) - add_format({ - 'url': f'{track_url}/player_asset?quality=hd', - 'format_id': 'hd', - 'quality': 10, - }) - add_format({ - 'url': f'{track_url}/download', - 'format_id': 'download', - 'quality': 11, - }) - add_format({ - 'url': f'{track_url}/player_asset?quality=source', - 'format_id': 'source', - 'quality': 11, - }) + formats.append({ + **fmt, + 'url': format_url, + 'ext': urlhandle_detect_ext(urlh), + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + }) + return formats + def _real_extract(self, url): + track_id = self._match_id(url) + webpage = self._download_webpage(url, track_id) more_info = get_element_by_class('single-more-info', webpage) + data = self._download_json(f'https://{self._DOMAIN}/t/{track_id}.json', track_id, fatal=False) def get_info_field(name): return self._html_search_regex( - rf']*>\s*{name}\s*\s*]*>(.*?)', + rf']*>\s*{name}\s*\s*]*>(.+?)', more_info, name, default=None, flags=re.DOTALL) return { 'id': track_id, - 'formats': formats, + 'formats': self._extract_formats(track_id), 'genre': self._html_search_regex( - r']+class="single-band-genre"[^>]*>([^<]+)', webpage, 'genre', fatal=False), + r']+class="single-band-genre"[^>]*>([^<]+)', webpage, 'genre', default=None), 'release_year': int_or_none(get_info_field('Year of creation')), 'description': get_info_field('Description'), 'tags': try_call(lambda: get_info_field('Tag').split(', '), list), @@ -80,8 +78,8 @@ def get_info_field(name): class Mx3IE(Mx3BaseIE): - _MX3_DOMAIN = 'mx3.ch' - _VALID_URL = r'https?://(?:www\.)?mx3\.ch/t/(?P[0-9A-Za-z]+)' + _DOMAIN = 'mx3.ch' + _VALID_URL = _Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) _TESTS = [{ 'url': 'https://mx3.ch/t/1Cru', 'md5': '7ba09e9826b4447d4e1ce9d69e0e295f', @@ -132,8 +130,8 @@ class Mx3IE(Mx3BaseIE): class Mx3NeoIE(Mx3BaseIE): - _MX3_DOMAIN = 'neo.mx3.ch' - _VALID_URL = r'https?://(?:www\.)?neo.mx3\.ch/t/(?P[0-9A-Za-z]+)' + _DOMAIN = 'neo.mx3.ch' + _VALID_URL = _Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) _TESTS = [{ 'url': 'https://neo.mx3.ch/t/1hpd', 'md5': '6d9986bbae5cac3296ec8813bf965eb2', @@ -153,8 +151,8 @@ class Mx3NeoIE(Mx3BaseIE): class Mx3VolksmusikIE(Mx3BaseIE): - _MX3_DOMAIN = 'volksmusik.mx3.ch' - _VALID_URL = r'https?://(?:www\.)?volksmusik.mx3\.ch/t/(?P[0-9A-Za-z]+)' + _DOMAIN = 'volksmusik.mx3.ch' + _VALID_URL = _Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) _TESTS = [{ 'url': 'https://volksmusik.mx3.ch/t/Zx', 'md5': 'dd967a7b0c1ef898f3e072cf9c2eae3c', From 212ff27cb59b69bad24a2e675d97e52ead5b12e8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 21 Jan 2024 02:27:27 +0000 Subject: [PATCH 12/14] qualities --- yt_dlp/extractor/mx3.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index 71870cdc088..b4fd5ab72e3 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -17,19 +17,19 @@ class Mx3BaseIE(InfoExtractor): _FORMATS = [{ 'url': 'player_asset', 'format_id': 'default', - 'quality': 1, + 'quality': 0, }, { 'url': 'player_asset?quality=hd', 'format_id': 'hd', - 'quality': 10, + 'quality': 1, }, { 'url': 'download', 'format_id': 'download', - 'quality': 11, + 'quality': 2, }, { 'url': 'player_asset?quality=source', 'format_id': 'source', - 'quality': 11, + 'quality': 2, }] def _extract_formats(self, track_id): From 1cb1df5d401172bb7b6770605a72fdb9bd03b0a8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 21 Jan 2024 02:29:23 +0000 Subject: [PATCH 13/14] oops --- yt_dlp/extractor/mx3.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index b4fd5ab72e3..e39334a6e2e 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -79,7 +79,7 @@ def get_info_field(name): class Mx3IE(Mx3BaseIE): _DOMAIN = 'mx3.ch' - _VALID_URL = _Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) + _VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) _TESTS = [{ 'url': 'https://mx3.ch/t/1Cru', 'md5': '7ba09e9826b4447d4e1ce9d69e0e295f', @@ -131,7 +131,7 @@ class Mx3IE(Mx3BaseIE): class Mx3NeoIE(Mx3BaseIE): _DOMAIN = 'neo.mx3.ch' - _VALID_URL = _Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) + _VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) _TESTS = [{ 'url': 'https://neo.mx3.ch/t/1hpd', 'md5': '6d9986bbae5cac3296ec8813bf965eb2', @@ -152,7 +152,7 @@ class Mx3NeoIE(Mx3BaseIE): class Mx3VolksmusikIE(Mx3BaseIE): _DOMAIN = 'volksmusik.mx3.ch' - _VALID_URL = _Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) + _VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) _TESTS = [{ 'url': 'https://volksmusik.mx3.ch/t/Zx', 'md5': 'dd967a7b0c1ef898f3e072cf9c2eae3c', From 336e2f1da0e7d19cad70f5dfcc449e61e9da53a8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 21 Jan 2024 02:31:32 +0000 Subject: [PATCH 14/14] revert `get_info_field` regex change --- yt_dlp/extractor/mx3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py index e39334a6e2e..cb9f50e0cfe 100644 --- a/yt_dlp/extractor/mx3.py +++ b/yt_dlp/extractor/mx3.py @@ -56,7 +56,7 @@ def _real_extract(self, url): def get_info_field(name): return self._html_search_regex( - rf']*>\s*{name}\s*\s*]*>(.+?)', + rf']*>\s*{name}\s*\s*]*>(.*?)', more_info, name, default=None, flags=re.DOTALL) return {