Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[extractor/rheinmaintv] Add extractor #7311

Merged
merged 39 commits into from Jun 22, 2023
Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
3242430
Add files via upload
barthelmannk Dec 19, 2022
fe5a5b5
Update _extractors.py
barthelmannk Dec 19, 2022
11a29f8
Update common.py
barthelmannk Dec 19, 2022
9b43787
Update rheinmaintv.py
barthelmannk Dec 19, 2022
6508252
Update rheinmaintv.py
barthelmannk Dec 20, 2022
d4eb78a
Update common.py
barthelmannk Dec 20, 2022
b06bf38
Update rheinmaintv.py
barthelmannk Dec 20, 2022
68a9421
Update rheinmaintv.py
barthelmannk Dec 21, 2022
e5aaa5f
Update rheinmaintv.py
barthelmannk Dec 21, 2022
93f9432
Update rheinmaintv.py
barthelmannk Dec 22, 2022
a500e20
Update rheinmaintv.py
barthelmannk Dec 22, 2022
164827c
Update rheinmaintv.py
barthelmannk Dec 29, 2022
7224c30
Update rheinmaintv.py
barthelmannk Dec 29, 2022
6a04563
Update rheinmaintv.py
barthelmannk Dec 30, 2022
c91da52
Update rheinmaintv.py
barthelmannk Dec 30, 2022
542cfa5
Update common.py
barthelmannk Jan 4, 2023
bc9503d
Update rheinmaintv.py
barthelmannk Jan 4, 2023
b4fe90b
Merge branch 'master' into rheinmaintv
barthelmannk Jan 4, 2023
34cd757
Update YoutubeDL.py
barthelmannk Jan 4, 2023
636e869
Update YoutubeDL.py
barthelmannk Jan 5, 2023
599a9b7
Update rheinmaintv.py
barthelmannk Jan 5, 2023
bf4fdb7
Update common.py
barthelmannk Jan 5, 2023
fa85dcf
Update YoutubeDL.py
barthelmannk Jan 11, 2023
95a84e0
Update rheinmaintv.py
barthelmannk Jan 11, 2023
84a959e
Apply suggestions from code review
Lesmiscore Jan 12, 2023
64c71f1
Update rheinmaintv.py
barthelmannk Jan 12, 2023
cf015ad
Revert core changes
bashonly Jun 14, 2023
2e72388
Revert ext hack
bashonly Jun 14, 2023
b3f3e40
Test cleanup part 1
bashonly Jun 14, 2023
01a49eb
Merge branch 'yt-dlp:master' into pr/rhein
bashonly Jun 14, 2023
af3f553
Cleanup tests part 2
bashonly Jun 14, 2023
08d65a6
[utils] Fix case bug in `sanitize_codec`
bashonly Jun 14, 2023
60d531e
Add test comment
bashonly Jun 14, 2023
937b761
Cleanup
bashonly Jun 14, 2023
97e4572
Relax/cleanup `_VALID_URL` regex
bashonly Jun 14, 2023
eb98f41
Add test for updated `_VALID_URL`
bashonly Jun 15, 2023
07cb271
remove superfluous comment
bashonly Jun 22, 2023
9b23e79
revert change to _utils.py
bashonly Jun 22, 2023
011d56b
merge dicts
bashonly Jun 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions yt_dlp/extractor/_extractors.py
Expand Up @@ -1612,6 +1612,7 @@
from .restudy import RestudyIE
from .reuters import ReutersIE
from .reverbnation import ReverbNationIE
from .rheinmaintv import RheinMainTVIE
from .rice import RICEIE
from .rmcdecouverte import RMCDecouverteIE
from .rockstargames import RockstarGamesIE
Expand Down
100 changes: 100 additions & 0 deletions yt_dlp/extractor/rheinmaintv.py
@@ -0,0 +1,100 @@
from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    extract_attributes,
    merge_dicts,
    remove_end,
    traverse_obj,
)


class RheinMainTVIE(InfoExtractor):
    """Extractor for programme ("sendungen") pages on rheinmaintv.de."""

    _VALID_URL = r'https?://(?:www\.)?rheinmaintv\.de/sendungen/(?:[\w-]+/)*(?P<video_id>(?P<display_id>[\w-]+)/vom-\d{2}\.\d{2}\.\d{4}(?:/\d+)?)'
    _TESTS = [{
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/auf-dem-weg-zur-deutschen-meisterschaft/vom-07.11.2022/',
        'info_dict': {
            'id': 'auf-dem-weg-zur-deutschen-meisterschaft-vom-07.11.2022',
            'ext': 'ismv',  # ismv+isma will be merged into mp4
            'alt_title': 'Auf dem Weg zur Deutschen Meisterschaft',
            'title': 'Auf dem Weg zur Deutschen Meisterschaft',
            'upload_date': '20221108',
            'view_count': int,
            'display_id': 'auf-dem-weg-zur-deutschen-meisterschaft',
            'thumbnail': r're:^https://.+\.jpg',
            'description': 'md5:48c59b74192bc819a9b34af1d5ed1eb9',
            'timestamp': 1667933057,
            'duration': 243.0,
        },
        'params': {'skip_download': 'ism'},
    }, {
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
        'info_dict': {
            'id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften-vom-14.11.2022',
            'ext': 'ismv',
            'title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
            'timestamp': 1668526214,
            'display_id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften',
            'alt_title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
            'view_count': int,
            'thumbnail': r're:^https://.+\.jpg',
            'duration': 345.0,
            'description': 'md5:9370ba29526984006c2cba1372e5c5a0',
            'upload_date': '20221115',
        },
        'params': {'skip_download': 'ism'},
    }, {
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/casino-mainz-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
        'info_dict': {
            'id': 'casino-mainz-bei-den-deutschen-meisterschaften-vom-14.11.2022',
            'ext': 'ismv',
            'title': 'Casino Mainz bei den Deutschen Meisterschaften',
            'view_count': int,
            'timestamp': 1668527402,
            'alt_title': 'Casino Mainz bei den Deutschen Meisterschaften',
            'upload_date': '20221115',
            'display_id': 'casino-mainz-bei-den-deutschen-meisterschaften',
            'duration': 348.0,
            'thumbnail': r're:^https://.+\.jpg',
            'description': 'md5:70fc1660eeba96da17199e5bdff4c0aa',
        },
        'params': {'skip_download': 'ism'},
    }, {
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/bricks4kids/vom-22.06.2022/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for a single programme page.

        The ISM manifest URL is taken from the page's ``<source>`` tag and,
        failing that, from the ``embedUrl`` of a ``VideoObject`` JSON-LD
        entry. Metadata comes from the page markup first, with the parsed
        JSON-LD merged in as a secondary source.

        Raises ExtractorError when no manifest URL can be located.
        """
        mobj = self._match_valid_url(url)
        display_id = mobj.group('display_id')
        # The matched id contains path separators ("<slug>/vom-DD.MM.YYYY");
        # flatten them so the id is a single token.
        video_id = mobj.group('video_id').replace('/', '-')
        webpage = self._download_webpage(url, video_id)

        # The <source> tag is expected to be immediately followed by the poster <img>.
        source, img = self._search_regex(
            r'(?s)(?P<source><source[^>]*>)(?P<img><img[^>]*>)',
            webpage, 'video', group=('source', 'img'))
        source = extract_attributes(source)
        img = extract_attributes(img)

        # Keep the raw JSON-LD objects around: self._json_ld extracts the
        # useless 'contentUrl' (as 'url') instead of the essential 'embedUrl'.
        raw_json_ld = list(self._yield_json_ld(webpage, video_id))
        json_ld = self._json_ld(raw_json_ld, video_id)
        # Drop that 'url' so it is not merged into the result next to 'formats'.
        json_ld.pop('url', None)

        # First non-empty embedUrl among the VideoObject entries, or None.
        ism_manifest_url = source.get('src') or traverse_obj(
            raw_json_ld,
            (lambda _, v: v.get('@type') == 'VideoObject', 'embedUrl'),
            get_all=False)
        if not ism_manifest_url:
            # Fail with a proper extractor error instead of leaking StopIteration
            # or handing None to the manifest downloader.
            raise ExtractorError('Unable to find the ISM manifest URL', video_id=video_id)
        formats, subtitles = self._extract_ism_formats_and_subtitles(ism_manifest_url, video_id)

        # Page-derived values take priority; title and description keep their
        # own non-JSON-LD fallbacks. The remaining JSON-LD metadata (timestamp,
        # duration, view_count, ...) is merged in as the secondary dict.
        return merge_dicts({
            'id': video_id,
            'display_id': display_id,
            'title':
                self._html_search_regex(r'<h1><span class="title">([^<]*)</span>',
                                        webpage, 'headline', default=None)
                or img.get('title') or json_ld.get('title') or self._og_search_title(webpage)
                or remove_end(self._html_extract_title(webpage), ' -'),
            'alt_title': img.get('alt'),
            'description': json_ld.get('description') or self._og_search_description(webpage),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [{'url': img['src']}] if 'src' in img else json_ld.get('thumbnails'),
        }, json_ld)
3 changes: 2 additions & 1 deletion yt_dlp/utils/_utils.py
Expand Up @@ -3502,7 +3502,8 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
},
}

sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
sanitize_codec = functools.partial(
try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll merge this separately

vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

for ext in preferences or COMPATIBLE_CODECS.keys():
Expand Down