Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[extractor/wykop] Add extractors (#6140)
Authored by: selfisekai
- Loading branch information
1 parent
fc5a7f9
commit aed945e
Showing 2 changed files with 274 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,268 @@ | ||
import json | ||
import urllib.error | ||
|
||
from .common import InfoExtractor | ||
from ..utils import ( | ||
ExtractorError, | ||
format_field, | ||
parse_iso8601, | ||
traverse_obj, | ||
url_or_none, | ||
) | ||
|
||
|
||
class WykopBaseExtractor(InfoExtractor):
    # Shared plumbing for the wykop.pl extractors: anonymous token handling,
    # calls to the https://wykop.pl/api/v3/ endpoints and extraction of the
    # metadata fields that the API objects used below have in common.

    def _get_token(self, force_refresh=False):
        """Return a bearer token for the wykop.pl API.

        Unless *force_refresh* is true, a truthy value stored in the cache
        under section ``wykop`` / key ``bearer`` is returned as-is. Otherwise
        a new anonymous token is requested from the ``auth`` endpoint, written
        back to the same cache slot and returned.
        """
        if not force_refresh:
            maybe_cached = self.cache.load('wykop', 'bearer')
            if maybe_cached:
                return maybe_cached

        new_token = traverse_obj(
            self._do_call_api('auth', None, 'Downloading anonymous auth token', data={
                # hardcoded in frontend
                'key': 'w53947240748',
                'secret': 'd537d9e0a7adc1510842059ae5316419',
            }), ('data', 'token'))

        self.cache.store('wykop', 'bearer', new_token)
        return new_token

    def _do_call_api(self, path, video_id, note='Downloading JSON metadata', data=None, headers=None):
        """Perform a single request against ``https://wykop.pl/api/v3/<path>``.

        When *data* is truthy it is wrapped as ``{'data': data}``, serialized
        to JSON bytes and sent as the request body with a JSON content type.
        *headers* is never modified: it is copied first, so neither a shared
        default nor the caller's mapping picks up the ``Content-Type`` entry.

        Returns whatever ``self._download_json`` returns for the request.
        """
        # Work on a private copy; the Content-Type header is added below.
        headers = dict(headers or {})
        if data:
            data = json.dumps({'data': data}).encode()
            headers['Content-Type'] = 'application/json'

        return self._download_json(
            f'https://wykop.pl/api/v3/{path}', video_id,
            note=note, data=data, headers=headers)

    def _call_api(self, path, video_id, note='Downloading JSON metadata'):
        """Call an API endpoint with bearer authorization.

        If the first attempt fails with an ``ExtractorError`` caused by an
        HTTP 403, the token is force-refreshed and the request is retried
        once. Any other error, or a failure of the retry, is re-raised.
        """
        token = self._get_token()
        for retrying in range(2):
            try:
                return self._do_call_api(path, video_id, note, headers={'Authorization': f'Bearer {token}'})
            except ExtractorError as e:
                # Only the first attempt may recover, and only from a 403
                # (presumably an expired/invalid cached token).
                if not retrying and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403:
                    token = self._get_token(True)
                    continue
                raise

    def _common_data_extract(self, data):
        """Build the info-dict fields shared by all wykop.pl extractors.

        *data* is the ``data`` object of an API response. The result is a
        ``url_transparent`` entry pointing at the embedded/linked media URL,
        together with author, timestamp, vote/comment counts, age limit and
        tags taken from *data*.
        """
        author = traverse_obj(data, ('author', 'username'), expected_type=str)

        return {
            '_type': 'url_transparent',
            'display_id': data.get('slug'),
            'url': traverse_obj(data,
                                ('media', 'embed', 'url'),  # what gets an iframe embed
                                ('source', 'url'),  # clickable url (dig only)
                                expected_type=url_or_none),
            'thumbnail': traverse_obj(
                data, ('media', 'photo', 'url'), ('media', 'embed', 'thumbnail'), expected_type=url_or_none),
            # the username serves as both the display name and the id
            'uploader': author,
            'uploader_id': author,
            'uploader_url': format_field(author, None, 'https://wykop.pl/ludzie/%s'),
            'timestamp': parse_iso8601(data.get('created_at'), delimiter=' '),  # time it got submitted
            'like_count': traverse_obj(data, ('votes', 'up'), expected_type=int),
            'dislike_count': traverse_obj(data, ('votes', 'down'), expected_type=int),
            'comment_count': traverse_obj(data, ('comments', 'count'), expected_type=int),
            'age_limit': 18 if data.get('adult') else 0,
            'tags': data.get('tags'),
        }
|
||
|
||
class WykopDigIE(WykopBaseExtractor):
    # Handles wykop.pl link ("dig") pages of the form /link/<id>.
    IE_NAME = 'wykop:dig'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/link/6912923/najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
        'info_dict': {
            'id': 'rlSTBvViflc',
            'ext': 'mp4',
            'title': 'Najbardziej zrzędliwy kot na świecie I Frozen Planet II I BBC Earth',
            'display_id': 'najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
            'description': 'md5:ac0f87dea1cdcb6b0c53f3612a095c87',
            'tags': ['zwierzaczki', 'koty', 'smiesznykotek', 'humor', 'rozrywka', 'ciekawostki'],
            'age_limit': 0,
            'timestamp': 1669154480,
            'release_timestamp': 1669194241,
            'release_date': '20221123',
            'uploader': 'starnak',
            'uploader_id': 'starnak',
            'uploader_url': 'https://wykop.pl/ludzie/starnak',
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
            'view_count': int,
            'channel': 'BBC Earth',
            'channel_id': 'UCwmZiChSryoWQCZMIQezgTg',
            'channel_url': 'https://www.youtube.com/channel/UCwmZiChSryoWQCZMIQezgTg',
            'categories': ['Pets & Animals'],
            'upload_date': '20220923',
            'duration': 191,
            'channel_follower_count': int,
            'availability': 'public',
            'live_status': 'not_live',
            'playable_in_embed': True,
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Accept link URLs, except those that WykopDigCommentIE claims."""
        matched = cls._match_valid_url(url)
        if not matched:
            return matched
        return not WykopDigCommentIE.suitable(url)

    def _real_extract(self, url):
        """Fetch ``links/<id>`` from the API and build the info dict."""
        video_id = self._match_id(url)
        response = self._call_api(f'links/{video_id}', video_id)
        data = response['data']

        info = self._common_data_extract(data)
        info.update({
            'id': video_id,
            'title': data['title'],
            'description': data.get('description'),
            # time it got "digged" to the homepage
            'release_timestamp': parse_iso8601(data.get('published_at'), delimiter=' '),
        })
        return info
|
||
|
||
class WykopDigCommentIE(WykopBaseExtractor):
    # Handles comments under a link: /link/<dig_id>/<slug>/komentarz/<id>.
    IE_NAME = 'wykop:dig:comment'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<dig_id>\d+)/[^/]+/komentarz/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/link/6992589/strollowal-oszusta-przez-ponad-24-minuty-udawal-naiwniaka-i-nagral-rozmowe/komentarz/114540527/podobna-sytuacja-ponizej-ciekawa-dyskusja-z-oszustem-na-sam-koniec-sam-bylem-w-biurze-swiadkiem-podobnej-rozmowy-niemal-zakonczonej-sukcesem-bandyty-g',
        'info_dict': {
            'id': 'u6tEi2FmKZY',
            'ext': 'mp4',
            'title': 'md5:e7c741c5baa7ed6478000caf72865577',
            'display_id': 'md5:45b2d12bd0e262d09cc7cf7abc8412db',
            'description': 'md5:bcec7983429f9c0630f9deb9d3d1ba5e',
            'timestamp': 1674476945,
            'uploader': 'Bartholomew',
            'uploader_id': 'Bartholomew',
            'uploader_url': 'https://wykop.pl/ludzie/Bartholomew',
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
            'tags': [],
            'availability': 'public',
            'duration': 1838,
            'upload_date': '20230117',
            'categories': ['Entertainment'],
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'channel_follower_count': int,
            'playable_in_embed': True,
            'live_status': 'not_live',
            'age_limit': 0,
            'chapters': 'count:3',
            'channel': 'Poszukiwacze Okazji',
            'channel_id': 'UCzzvJDZThwv06dR4xmzrZBw',
            'channel_url': 'https://www.youtube.com/channel/UCzzvJDZThwv06dR4xmzrZBw',
        },
    }]

    def _real_extract(self, url):
        """Fetch ``links/<dig_id>/comments/<id>`` and build the info dict."""
        ids = self._search_regex(self._VALID_URL, url, 'dig and comment ids', group=('dig_id', 'id'))
        dig_id, comment_id = ids
        response = self._call_api(f'links/{dig_id}/comments/{comment_id}', comment_id)
        data = response['data']

        info = self._common_data_extract(data)
        # The title is "<author username> - <comment text>".
        username = traverse_obj(data, ('author', 'username'))
        content = data.get('content')
        info.update({
            'id': comment_id,
            'title': f"{username} - {content or ''}",
            'description': content,
        })
        return info
|
||
|
||
class WykopPostIE(WykopBaseExtractor):
    # Handles microblog posts of the form /wpis/<id>.
    IE_NAME = 'wykop:post'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/wpis/68893343/kot-koty-smiesznykotek',
        'info_dict': {
            'id': 'PL8JMjiUPHUhwc9ZlKa_5IFeBwBV8Xe7jI',
            'title': 'PawelW124 - #kot #koty #smiesznykotek',
            'description': '#kot #koty #smiesznykotek',
            'display_id': 'kot-koty-smiesznykotek',
            'tags': ['kot', 'koty', 'smiesznykotek'],
            'uploader': 'PawelW124',
            'uploader_id': 'PawelW124',
            'uploader_url': 'https://wykop.pl/ludzie/PawelW124',
            'timestamp': 1668938142,
            'age_limit': 0,
            'like_count': int,
            'dislike_count': int,
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
            'comment_count': int,
            'channel': 'Revan',
            'channel_id': 'UCW9T_-uZoiI7ROARQdTDyOw',
            'channel_url': 'https://www.youtube.com/channel/UCW9T_-uZoiI7ROARQdTDyOw',
            'upload_date': '20221120',
            'modified_date': '20220814',
            'availability': 'public',
            'view_count': int,
        },
        'playlist_mincount': 15,
        'params': {
            'flat_playlist': True,
        }
    }]

    @classmethod
    def suitable(cls, url):
        """Accept post URLs, except those that WykopPostCommentIE claims."""
        matched = cls._match_valid_url(url)
        if not matched:
            return matched
        return not WykopPostCommentIE.suitable(url)

    def _real_extract(self, url):
        """Fetch ``entries/<id>`` from the API and build the info dict."""
        video_id = self._match_id(url)
        response = self._call_api(f'entries/{video_id}', video_id)
        data = response['data']

        info = self._common_data_extract(data)
        # The title is "<author username> - <post text>".
        username = traverse_obj(data, ('author', 'username'))
        content = data.get('content')
        info.update({
            'id': video_id,
            'title': f"{username} - {content or ''}",
            'description': content,
        })
        return info
|
||
|
||
class WykopPostCommentIE(WykopBaseExtractor):
    # Handles comments under a microblog post: /wpis/<post_id>/<slug>#<id>.
    IE_NAME = 'wykop:post:comment'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<post_id>\d+)/[^/#]+#(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/wpis/70084873/test-test-test#249303979',
        'info_dict': {
            'id': 'confusedquickarmyant',
            'ext': 'mp4',
            'title': 'tpap - treść komentarza',
            'display_id': 'tresc-komentarza',
            'description': 'treść komentarza',
            'uploader': 'tpap',
            'uploader_id': 'tpap',
            'uploader_url': 'https://wykop.pl/ludzie/tpap',
            'timestamp': 1675349470,
            'upload_date': '20230202',
            'tags': [],
            'duration': 2.12,
            'age_limit': 0,
            'categories': [],
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
        },
    }]

    def _real_extract(self, url):
        """Fetch ``entries/<post_id>/comments/<id>`` and build the info dict."""
        ids = self._search_regex(self._VALID_URL, url, 'post and comment ids', group=('post_id', 'id'))
        post_id, comment_id = ids
        response = self._call_api(f'entries/{post_id}/comments/{comment_id}', comment_id)
        data = response['data']

        info = self._common_data_extract(data)
        # The title is "<author username> - <comment text>".
        username = traverse_obj(data, ('author', 'username'))
        content = data.get('content')
        info.update({
            'id': comment_id,
            'title': f"{username} - {content or ''}",
            'description': content,
        })
        return info