Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[thirtydaysinger] Add Extractor #9894

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions yt_dlp/extractor/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1981,6 +1981,10 @@
from .thestar import TheStarIE
from .thesun import TheSunIE
from .theweatherchannel import TheWeatherChannelIE
from .thirtydaysinger import (
ThirtyDaySingerIE,
ThirtyDaySingerPlaylistIE
)
from .thisamericanlife import ThisAmericanLifeIE
from .thisoldhouse import ThisOldHouseIE
from .thisvid import (
Expand Down
102 changes: 102 additions & 0 deletions yt_dlp/extractor/thirtydaysinger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from .wistia import WistiaIE
from ..utils import (
    ExtractorError,
    clean_html,
    get_elements_html_by_class,
    urljoin,
)


class ThirtyDaySingerBase(WistiaIE):
    """Shared helpers for the 30 Day Singer lesson and course extractors.

    The media itself is resolved through the Wistia helpers inherited from
    ``WistiaIE``; title and description are scraped from the site's own page.
    """

    # The site makes extensive use of HTML lists for formatting and `clean_html`
    # doesn't handle them well. Rewriting the list tags as line breaks first
    # keeps lists readable in the cleaned description.
    _HTML_LIST_REPLACEMENTS = {
        '<ul>': '<br>',
        '</ul>': '<br>',
        '<li>': '<br>- ',
        '</li>': '',
    }

    def _extract_for_url(self, url):
        """Download the page at *url* and return the info dict of its video.

        The result merges the metadata of the first Wistia async embed found
        on the page with the title/description scraped from the page itself;
        page-level values win on key collisions.

        Raises ExtractorError if the page contains no Wistia embed.
        """
        lesson_index = self._match_id(url)
        webpage = self._download_webpage(url, lesson_index)

        # A bare next() would leak StopIteration when no embed is present
        match = next(self._extract_wistia_async_embed(webpage), None)
        if match is None:
            raise ExtractorError(
                'Unable to find a Wistia embed on the page', video_id=lesson_index)
        embed_config = self._download_embed_config('medias', match.group('id'), url)

        embed_infojson = self._extract_media(embed_config)
        webpage_infojson = self._extract_webpage_data(webpage)

        return {**embed_infojson, **webpage_infojson}

    def _extract_webpage_data(self, webpage):
        """Return a dict with the 'title' and 'description' scraped from *webpage*.

        The title comes from the first plain ``<h1>`` element, falling back to
        the document ``<title>``. The description comes from the
        ``description`` meta tag and is None when that tag is missing.
        """
        # default=None keeps a missing <h1> non-fatal so the fallback can apply
        title = self._html_search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', default=None)
        if not title:
            title = self._html_extract_title(webpage)
        description = self._html_search_meta('description', webpage, fatal=False)

        return {
            'title': title,
            'description': clean_html(self._format_html_list(description)),
        }

    def _format_html_list(self, html):
        """Rewrite ``<ul>``/``<li>`` markup in *html* as ``<br>``-separated lines.

        Falsy input (None or an empty string) is returned unchanged.
        """
        if not html:
            return html

        for tag, replacement in self._HTML_LIST_REPLACEMENTS.items():
            html = html.replace(tag, replacement)

        return html


class ThirtyDaySingerIE(ThirtyDaySingerBase):
    """Extractor for a single 30 Day Singer lesson page (``/tutorial/<course>/<lesson>``)."""

    # Dots are escaped so they only match a literal '.', and the optional
    # scheme uses a non-capturing group since its value is never read.
    _VALID_URL = r'(?:https?://)?www\.30daysinger\.com/tutorial/[\w-]+/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://www.30daysinger.com/tutorial/30-day-beginner-course-with-jonathan-estabrooks/1',
        'md5': '56bb11529b9777899b27b599d4b16cf6',
        'info_dict': {
            'id': 'tegd38l3d5',
            'ext': 'mp4',
            'thumbnail': 'http://embed.wistia.com/deliveries/c26a85cb98e32efa8a5e12a0576e63355af66230.jpg',
            'upload_date': '20190608',
            'description': 'md5:d3291de8988be57b1d3e411126ba4d33',
            'duration': 344.22,
            'timestamp': 1559952526,
            'title': 'Welcome to 30 Day Singer',
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the lesson at *url*."""
        return self._extract_for_url(url)


class ThirtyDaySingerPlaylistIE(ThirtyDaySingerBase):
    """Extractor for a 30 Day Singer course page (``/tutorial/<course>``) as a playlist."""

    _URI_BASE = 'https://www.30daysinger.com'
    # Dots are escaped, and the pattern is anchored after the course slug so
    # that single-lesson URLs (/tutorial/<course>/<lesson>) do not match here.
    _VALID_URL = r'(?:https?://)?www\.30daysinger\.com/tutorial/(?P<id>[\w-]+)/?(?:$|[?#])'

    _TESTS = [{
        'url': 'https://www.30daysinger.com/tutorial/30-day-beginner-course-with-jonathan-estabrooks',
        'info_dict': {
            'id': '30-day-beginner-course-with-jonathan-estabrooks',
            'description': 'md5:8cf6d6c7c377895653c9cde9dfc4104f',
            'title': '30 Day Beginner Course with Jonathan Estabrooks',
        },
        'playlist_count': 1,
    }]

    def _real_extract(self, url):
        """Return a playlist result with one entry per accessible lesson link.

        Links whose href contains 'upgrade' are treated as premium-only: a
        warning is reported and the lesson is skipped.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        playlist_attrs = self._extract_webpage_data(webpage)

        entries = []
        for html_element in get_elements_html_by_class('playlist-item-link', webpage):
            # default=None makes a missing href skippable instead of fatal
            href = self._search_regex(
                r'href="([^"]+)"', html_element, 'href', default=None)
            if not href:
                continue

            # Often _some_ content is free so we should still download that but warn the user
            # when we encounter premium content.
            # NOTE: this only applies to the playlist extractor, not the single video extractor
            if 'upgrade' in href:
                self.report_warning('This video is for premium members only')
                continue

            # urljoin copes with both site-relative and absolute hrefs
            lesson_url = urljoin(self._URI_BASE, href)
            if not lesson_url:
                continue

            # Delegate each lesson to the single-video extractor
            entries.append(self.url_result(lesson_url, ThirtyDaySingerIE))

        return self.playlist_result(
            entries, playlist_id, playlist_attrs['title'], playlist_attrs['description'])