Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[extractor/mx3] Add extractor #8736

Merged
merged 14 commits into from Jan 21, 2024
5 changes: 5 additions & 0 deletions yt_dlp/extractor/_extractors.py
Expand Up @@ -1124,6 +1124,11 @@
MusicdexArtistIE,
MusicdexPlaylistIE,
)
from .mx3 import (
Mx3IE,
Mx3NeoIE,
Mx3VolksmusikIE,
)
from .mxplayer import (
MxplayerIE,
MxplayerShowIE,
Expand Down
160 changes: 160 additions & 0 deletions yt_dlp/extractor/mx3.py
@@ -0,0 +1,160 @@
import re

from .common import InfoExtractor
from ..utils.traversal import traverse_obj
from ..utils import (
urlhandle_detect_ext,
url_or_none,
int_or_none,
unified_timestamp,
)
from ..networking import HEADRequest
martinxyz marked this conversation as resolved.
Show resolved Hide resolved


# Shared extraction logic for the mx3.ch family of sites; the concrete
# extractor classes further down set _MX3_DOMAIN to the host they serve.
class Mx3BaseIE(InfoExtractor):
# Host name interpolated into the JSON metadata URL and the media asset URLs below.
_MX3_DOMAIN = None

# Gather everything needed for one track: download the HTML page and the JSON
# metadata, then probe the candidate media URLs with HEAD requests.
def _real_extract(self, url):
track_id = self._match_id(url)
webpage = self._download_webpage(url, track_id)
# NOTE(review): the local name `json` would shadow the stdlib module if it were ever imported in this file.
json = self._download_json(f'https://{self._MX3_DOMAIN}/t/{track_id}.json', track_id)
martinxyz marked this conversation as resolved.
Show resolved Hide resolved

# The artist goes first; the performer is added only when present and not already listed.
artists = []
if json.get('artist'):
artists.append(json['artist'])
performer = json.get('performer_name')
if performer and performer not in artists:
artists.append(performer)

# The genre is scraped from the HTML page; fatal=False is passed so a missing genre is tolerated.
genre = self._html_search_regex(r'<div\b[^>]+class="single-band-genre"[^>]*>([^<]+)</div>',
webpage, 'genre', fatal=False, flags=re.DOTALL)

martinxyz marked this conversation as resolved.
Show resolved Hide resolved
formats = []

# Probe one candidate format with a HEAD request and, on HTTP 200, fill in
# ext/filesize/timestamp from the response before appending it to `formats`.
# `fatal` chooses between the default call and one that passes fatal=False
# together with expected_status=404.
# NOTE(review): presumably the default call raises when the request fails - confirm against InfoExtractor._request_webpage.
def add_format(fmt, fatal):
if fatal:
urlh = self._request_webpage(HEADRequest(fmt['url']), track_id, note='Fetching default media headers')
else:
urlh = self._request_webpage(HEADRequest(fmt['url']), track_id, fatal=False, expected_status=404,
note=f'Trying media headers for optional format {fmt["format_id"]}')
martinxyz marked this conversation as resolved.
Show resolved Hide resolved
# Any response other than a 200 leaves the format out of the list.
if urlh and urlh.status == 200:
fmt['ext'] = urlhandle_detect_ext(urlh)
fmt['filesize'] = int_or_none(urlh.headers.get('Content-Length'))
fmt['timestamp'] = unified_timestamp(urlh.headers.get('Last-Modified'))
martinxyz marked this conversation as resolved.
Show resolved Hide resolved
formats.append(fmt)

# All media URLs hang off this per-track prefix.
track_url = f'https://{self._MX3_DOMAIN}/tracks/{track_id}'
# The default player asset is the only format probed with fatal=True.
add_format({
'url': f'{track_url}/player_asset',
'format_id': 'default',
'quality': 1,
}, fatal=True)
# the formats below don't always exist
add_format({
'url': f'{track_url}/player_asset?quality=hd',
'format_id': 'hd',
'quality': 10,
}, fatal=False)
# The download format is given the largest quality value of the three.
add_format({
'url': f'{track_url}/download',
'format_id': 'download',
'quality': 11,
}, fatal=False)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO, yt-dlp should always download the highest quality format by default. We also prefer the source file on other extractors like Vimeo. If you don't want to download the highest quality format, you can use -f or -S.

Suggested change
add_format({
'url': f'{track_url}/player_asset',
'format_id': 'default',
'quality': 1,
}, fatal=True)
# the formats below don't always exist
add_format({
'url': f'{track_url}/player_asset?quality=hd',
'format_id': 'hd',
'quality': 10,
}, fatal=False)
add_format({
'url': f'{track_url}/download',
'format_id': 'download',
'quality': 11,
}, fatal=False)
add_format({
'url': f'{track_url}/player_asset',
'format_id': 'default',
'quality': 1,
})
add_format({
'url': f'{track_url}/player_asset?quality=hd',
'format_id': 'hd',
'quality': 10,
})
add_format({
'url': f'{track_url}/download',
'format_id': 'download',
'quality': 11,
})
add_format({
'url': f'{track_url}/player_asset?quality=source',
'format_id': 'source',
'quality': 11,
})

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm. I can see your point. But for me there is simply no difference in quality between a high-bitrate MP3 and WAV. For video I would list the formats and pick one manually, but for music I use the "Open With" browser extension (non-interactive) and check the download folder later. I'm going to use -f"best[ext!=wav][ext!=flac][filesize<50M]" -x now, so it will work for me if you add the format.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you only want the format selection for this site, just use -f hd/default. This will download hd if available and otherwise fall back to default.


# Assemble the result: the collected artists are joined into one
# comma-separated string, the genre comes from the HTML page, and the JSON
# metadata supplies title, composer and thumbnail.
return {
'id': track_id,
'formats': formats,
'artist': ', '.join(artists),
'genre': genre,
**traverse_obj(json, {
'title': ('title', {str}),
'composer': ('composer_name', {str}),
# The xlarge picture key is listed first; presumably get_all=False keeps only the first match - confirm against traverse_obj.
'thumbnail': (('picture_url_xlarge', 'picture_url'), {url_or_none}),
}, get_all=False),
}
Copy link
Collaborator

@seproDev seproDev Jan 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I wrote a function to extract more metadata fields. For artist/performer, how about we split this across artist and album_artist, with a fallback for artist.

Suggested change
return {
'id': track_id,
'formats': formats,
'artist': ', '.join(artists),
'genre': genre,
**traverse_obj(json, {
'title': ('title', {str}),
'composer': ('composer_name', {str}),
'thumbnail': (('picture_url_xlarge', 'picture_url'), {url_or_none}),
}, get_all=False),
}
more_info = get_element_by_class('single-more-info', webpage)
def get_info_field(name):
return self._html_search_regex(
rf'<dt[^>]*>\s*{name}\s*</dt>\s*<dd[^>]*>(.*?)</dd>',
more_info, name, default=None, flags=re.DOTALL)
return {
'id': track_id,
'formats': formats,
'genre': self._html_search_regex(
r'<div\b[^>]+class="single-band-genre"[^>]*>([^<]+)</div>', webpage, 'genre', fatal=False),
'release_year': int_or_none(get_info_field('Year of creation')),
'description': get_info_field('Description'),
'tags': try_call(lambda: get_info_field('Tag').split(', '), list),
**traverse_obj(data, {
'title': ('title', {str}),
'artist': (('performer_name', 'artist'), {str}),
'album_artist': ('artist', {str}),
'composer': ('composer_name', {str}),
'thumbnail': (('picture_url_xlarge', 'picture_url'), {url_or_none}),
}, get_all=False),
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The artist/album_artist split seems to fit pretty well; I like it. I slightly preferred the filenames I got previously with the format string. I'll get a few duplicated artist names now, but it's not too bad really.

I've updated the tests to match. I noticed that https://neo.mx3.ch/t/1hpd kind of has a description, but they put it all into the credits field, not sure if we want to add that.



class Mx3IE(Mx3BaseIE):
    """Extractor for tracks hosted on the main mx3.ch site."""

    _MX3_DOMAIN = 'mx3.ch'
    _VALID_URL = r'https?://(?:www\.)?mx3\.ch/t/(?P<id>[0-9A-Za-z]+)'
    _TESTS = [
        {
            # Audio-only track: the file is an mp3, which is only known after a HEAD request.
            'url': 'https://mx3.ch/t/1Cru',
            'md5': '82510bf4c21f17da41bff7e1ffd84e78',
            'info_dict': {
                'id': '1Cru',
                'ext': 'mp3',
                'artist': 'Tortue Tortue, Godina',
                'composer': 'Olivier Godinat',
                'genre': 'Rock',
                'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813',
                'title': 'S\'envoler',
                'timestamp': 1630272831,
            },
        },
        {
            # Music video; `file` reports: ISO Media, MP4 Base Media v1 [ISO 14496-12:2003]
            'url': 'https://mx3.ch/t/1LIY',
            'md5': '4117489dff8c763ecfbb0b95a67d6c8e',
            'info_dict': {
                'id': '1LIY',
                'ext': 'mp4',
                'artist': 'The Broots, Tania Kimfumu',
                'composer': 'Emmanuel Diserens',
                'genre': 'Electro',
                'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670',
                'title': 'The Broots-Larytta remix "Begging For Help"',
                'timestamp': 1686963636,
            },
        },
        {
            # Track with a download button, which yields a WAV file.
            'url': 'https://mx3.ch/t/1C6E',
            'md5': '1afcd578493ddb8e5008e94bb6d97e25',
            'info_dict': {
                'id': '1C6E',
                'ext': 'wav',
                'artist': 'Alien Bubblegum',
                'composer': 'Alien Bubblegum',
                'genre': 'Punk',
                'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733',
                'title': 'Wide Awake',
                'timestamp': 1627054732,
            },
        },
    ]


class Mx3NeoIE(Mx3BaseIE):
    """Extractor for tracks hosted on neo.mx3.ch."""

    _MX3_DOMAIN = 'neo.mx3.ch'
    # Every dot of the host name is escaped so it only matches a literal '.';
    # an unescaped dot would also accept unrelated hosts such as 'neoXmx3.ch'.
    _VALID_URL = r'https?://(?:www\.)?neo\.mx3\.ch/t/(?P<id>[0-9A-Za-z]+)'
    _TESTS = [{
        'url': 'https://neo.mx3.ch/t/1hpd',
        'md5': 'ff0b2b91ce0b8931c0a358715758dc78',
        'info_dict': {
            'id': '1hpd',
            'ext': 'mp3',
            'artist': 'Kammerorchester Basel, Baptiste Lopez',
            'composer': 'Jannik Giger',
            'genre': 'Composition, Orchestra',
            'title': 'Troisième œil. Für Kammerorchester (2023)',
            'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252',
            'timestamp': 1705055012,
        },
    }]


class Mx3VolksmusikIE(Mx3BaseIE):
    """Extractor for tracks hosted on volksmusik.mx3.ch."""

    _MX3_DOMAIN = 'volksmusik.mx3.ch'
    # Every dot of the host name is escaped so it only matches a literal '.';
    # an unescaped dot would also accept unrelated hosts such as 'volksmusikXmx3.ch'.
    _VALID_URL = r'https?://(?:www\.)?volksmusik\.mx3\.ch/t/(?P<id>[0-9A-Za-z]+)'
    _TESTS = [{
        'url': 'https://volksmusik.mx3.ch/t/Zx',
        'md5': 'dd967a7b0c1ef898f3e072cf9c2eae3c',
        'info_dict': {
            'id': 'Zx',
            'ext': 'mp3',
            'artist': 'Ländlerkapelle GrischArt',
            'composer': 'Urs Glauser',
            'genre': 'Instrumental, Graubünden',
            'title': 'Chämilouf',
            'thumbnail': 'https://volksmusik.mx3.ch/pictures/vxm/file/0000/3815/square_xlarge/grischart1.jpg?1450530120',
            'timestamp': 1450532809,
        },
    }]