Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[hketv] Add new extractor #18696

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions youtube_dl/extractor/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
from .hgtv import HGTVComShowIE
from .hketv import HKETVIE
from .hidive import HiDiveIE
from .historicfilms import HistoricFilmsIE
from .hitbox import HitboxIE, HitboxLiveIE
Expand Down
160 changes: 160 additions & 0 deletions youtube_dl/extractor/hketv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved
GeoRestrictedError,
clean_html,
int_or_none,
merge_dicts,
str_to_int,
try_get,
unified_strdate,
urlencode_postdata,
urljoin,
)


class HKETVIE(InfoExtractor):
IE_NAME = 'hketv'
IE_DESC = '香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau'
_VALID_URL = r'https?://(?:www\.)?hkedcity\.net/etv/resource/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://www.hkedcity.net/etv/resource/2932360618',
'md5': 'f193712f5f7abb208ddef3c5ea6ed0b7',
'info_dict': {
'id': '2932360618',
'ext': 'mp4',
'title': '喜閱一生(共享閱讀樂)',
'description': '本節目輯錄了「閱讀滿Fun嘉年華」和「二○一八響應世界閱讀日――悅愛閱讀・愈讀愈愛」的活動花絮,並由學者、作家、演藝界人士等,分享培養子女閱讀興趣和習慣的方法,以及呼籲大家一同分享閱讀的樂趣。',
'upload_date': '20181024',
'duration': 900,
'subtitles': {
'en': [{
'url': 'https://apps.hkedcity.net/media/mediaplayer/caption.php?f=74395&lang=en',
'ext': 'srt',
}],
'zh-Hant': [{
'url': 'https://apps.hkedcity.net/media/mediaplayer/caption.php?f=74395&lang=qmt',
'ext': 'srt',
}],
}
},
}]

_CC_LANGS = {
'中文(繁體中文)': 'zh-Hant',
'中文(简体中文)': 'zh-Hans',
'English': 'en',
'Bahasa Indonesia': 'id',
'\u0939\u093f\u0928\u094d\u0926\u0940': 'hi',
'\u0928\u0947\u092a\u093e\u0932\u0940': 'ne',
'Tagalog': 'tl',
'\u0e44\u0e17\u0e22': 'th',
'\u0627\u0631\u062f\u0648': 'ur',
}

def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)

file_id = self._html_search_regex(r'post_var\["file_id"\] = ([0-9]+);', webpage, 'file ID')
curr_url = self._html_search_regex(r'post_var\["curr_url"\] = "(.+?)";', webpage, 'curr URL')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Relax regexes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed the regex to r'post_var\["file_id"\]\s*=\s*(.+?);' for example.
Hope that is relaxed enough.

data = {
'action': 'get_info',
'curr_url': curr_url,
'file_id': file_id,
'video_url': file_id,
}
_APPS_BASE_URL = 'https://apps.hkedcity.net'
handler_url = urljoin(_APPS_BASE_URL, '/media/play/handler.php')
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved

response = self._download_json(
handler_url, video_id,
data=urlencode_postdata(data),
headers=merge_dicts({'Content-Type': 'application/x-www-form-urlencoded'},
self.geo_verification_headers()))

result = try_get(response, lambda x: x['result'], dict)
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved

formats = []
subtitles = {}

thumbnail_php = urljoin(_APPS_BASE_URL, result.get('image'))
thumbnail_urlh = self._downloader.urlopen(thumbnail_php)
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved
thumbnail = thumbnail_urlh.geturl()

if response.get('success') and response.get('access'):
width = str_to_int(result.get('width'))
height = str_to_int(result.get('height'))
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved

playlist0 = try_get(result, lambda x: x['playlist'][0], dict)
fmts = try_get(playlist0, lambda x: x['sources'], list)
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved
for fmt in fmts:
label = fmt.get('label')
if label == 'HD':
h = 720
elif label == 'SD':
h = 360
w = h * width // height
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Breaks on uninitialized h. Breaks on None width and height.

Copy link
Contributor Author

@anthonyfok anthonyfok Jan 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Indeed, some older videos were uploaded without width and height information, i.e. they are set to "0". Changed to the following:

            for fmt in fmts:
                label = fmt.get('label')
                w = None
                h = None
                if label == 'HD':
                    h = 720
                elif label == 'SD':
                    h = 360
                if h:
                    if width and height:
                        w = h * width // height
                    else:
                        w = h * 4 // 3

urlh = self._downloader.urlopen(urljoin(_APPS_BASE_URL, fmt.get('file')))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

urljoin replaced with +.

Or did you mean the use of self._downloader.urlopen()? That was done intentionally so that the resolved video link could be downloaded without geo-restriction, i.e. without a proxy and without the PHPSESSID cookie. (The geo-restriction is done within their PHP script only.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, may I keep using self._downloader.urlopen() for the MPEG-4 video file? :-)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, what for?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So that no cookie is needed with the link given by -g. The link given by urljoin(_APPS_BASE_URL, fmt.get('file')) looks something like this:

https://apps.hkedcity.net/media/mediaplayer/filename.php?t=85ee74dae3477d95b05cc3091f26ff5a&m=mp4&p=fc61b1acf92b52238ad5c616409759360b4d914f183691d79c03e27a335a1c7e99a0f5d12b7183530daad9edc79fb17d97c3b6c0d3c445d68b96c92fa88b3a2f&file=551cc229795bb70003e9962bda09899612a7ae10b183be5e84e136419d91c684c63c863a8e3237a753ed31921500c2d28aa4db43a8684e37caac000571948110&e=.mp4

which requires the correct cookies to work. It gives a "HTTP/1.1 302 Found" redirecting to a final URL that looks like https://video1.hkedcity.net/streaming/b3848d0a3296b85d67d88b749642c29a/5c3738d1/channels/etv/201507/20150716151039_989507_720.mp4 which requires no cookie. This final URL is easier for me to copy-and-paste to other programs or even to stream to other devices.

That said, I have decided to take your advice and not "fully resolve" to the final URL because the intermediate URL, with the appropriate cookies saved with --cookies, remains valid for many hours, whereas the fully resolved URL would be "410 Gone" within one or two hours. Also, I realized self._downloader.urlopen was being run twice in the for fmt in fmts: loop, leading to quite a few seconds of delay. (The HKETV server seems rather slow in responding to this request.)

formats.append({
'format_id': label,
'ext': fmt.get('type'),
'url': urlh.geturl(),
'width': w,
'height': h,
})

tracks = try_get(playlist0, lambda x: x['tracks'], list)
for track in tracks:
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved
if not isinstance(track, dict):
continue
track_kind = track.get('kind')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

str_or_none.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But I copied those few lines verbatim from _parse_jwplayer_data in youtube_dl/extractor/common.py! Doesn't that mean str_or_none needs to be added there too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed:

-                track_kind = track.get('kind')
+                track_kind = str_or_none(track.get('kind'))

if not track_kind or not isinstance(track_kind, compat_str):
continue
if track_kind.lower() not in ('captions', 'subtitles'):
continue
track_url = urljoin(_APPS_BASE_URL, track.get('file'))
if not track_url:
continue
track_label = track.get('label')
subtitles.setdefault(self._CC_LANGS.get(track_label, track_label), []).append({
'url': self._proto_relative_url(track_url),
'ext': 'srt',
})

else:
error = clean_html(response.get('access_err_msg'))
if error.find('Video streaming is not available in your country'):
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved
raise GeoRestrictedError(error)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raise_geo_restricted and specify countries.

Copy link
Contributor Author

@anthonyfok anthonyfok Jan 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! After consulting other extractors, I am now using this:

            self.raise_geo_restricted(msg=error, countries=self._GEO_COUNTRIES)

with the following definitions near the top:

    _GEO_BYPASS = False
    _GEO_COUNTRIES = ['HK']

else:
raise ExtractorError(error)

# Likes
emotion = self._download_json(
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved
'https://emocounter.hkedcity.net/handler.php',
video_id,
data=urlencode_postdata({
'action': 'get_emotion',
'data[bucket_id]': 'etv',
'data[identifier]': video_id,
}),
headers={'Content-Type': 'application/x-www-form-urlencoded'})
if emotion.get('result'):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

emotion is not guaranteed to be a dict.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Line removed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Line removed.

As in

    like_count = int_or_none(try_get(emotion, lambda x: x['data']['emotion_data'][0]['count']))

alone will work just fine without the extraneous if emotion.get('result'): check.

like_count = str_to_int(try_get(emotion, lambda x: x['data']['emotion_data'][0]['count'], str))
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved

return {
'id': video_id,
'title': self._html_search_meta('ed_title', webpage),
anthonyfok marked this conversation as resolved.
Show resolved Hide resolved
'description': self._html_search_meta('description', webpage, fatal=False),
'upload_date': unified_strdate(self._html_search_meta('ed_date', webpage, fatal=False), day_first=False),
'duration': int_or_none(result.get('length')),
'formats': formats,
'subtitles': subtitles,
'thumbnail': thumbnail,
'view_count': str_to_int(result.get('view_count')),
'like_count': like_count,
}