[ie/twitter] Add fallback, improve error handling #7621

Merged — 16 commits, Jul 29, 2023
78 changes: 59 additions & 19 deletions yt_dlp/extractor/twitter.py
@@ -1,3 +1,4 @@
import functools
import json
import re

@@ -279,6 +280,12 @@ def input_dict(subtask_id, text):
'Submitting confirmation code', headers, data=build_login_json(input_dict(
next_subtask, self._get_tfa_info('confirmation code sent to your email or phone'))))

elif next_subtask == 'ArkoseLogin':
self.raise_login_required('Twitter is requiring captcha for this login attempt', method='cookies')

elif next_subtask == 'DenyLoginSubtask':
self.raise_login_required('Twitter rejected this login attempt as suspicious', method='cookies')

elif next_subtask == 'LoginSuccessSubtask':
raise ExtractorError('Twitter API did not grant auth token cookie')
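The two new branches above surface captcha and suspicious-login blocks instead of letting the flow fail with an opaque error. Purely as an illustration of the dispatch pattern (not the extractor's actual structure; the plain exception and message suffix below are assumptions), the same mapping could be table-driven:

# Illustrative sketch only: the extractor uses an if/elif chain plus
# self.raise_login_required(..., method='cookies'); this shows the same mapping as a table.
LOGIN_BLOCKERS = {
    'ArkoseLogin': 'Twitter is requiring captcha for this login attempt',
    'DenyLoginSubtask': 'Twitter rejected this login attempt as suspicious',
}

def check_login_subtask(next_subtask):
    if next_subtask in LOGIN_BLOCKERS:
        # Hypothetical stand-in for raise_login_required()
        raise RuntimeError(f'{LOGIN_BLOCKERS[next_subtask]}. Use browser cookies instead')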

@@ -304,8 +311,9 @@ def _call_api(self, path, video_id, query={}, graphql=False):

if result.get('errors'):
errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str}))))
raise ExtractorError(
f'Error(s) while querying API: {errors or "Unknown error"}', expected=True)
if errors and 'not authorized' in errors:
self.raise_login_required(remove_end(errors, '.'))
raise ExtractorError(f'Error(s) while querying API: {errors or "Unknown error"}')

return result
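For context, the check above joins every errors[].message string from the API response and only asks for login when the message mentions authorization. A minimal plain-Python sketch of that logic (the sample payload is an assumption, and stdlib code stands in for yt-dlp's traverse_obj/remove_end helpers):

# Plain-Python sketch of the error handling above; the sample payload is illustrative.
def summarize_api_errors(result):
    messages = {e.get('message') for e in result.get('errors') or []
                if isinstance(e.get('message'), str)}
    return ', '.join(sorted(messages))

sample = {'errors': [{'message': 'Sorry, you are not authorized to see this status.'}]}
errors = summarize_api_errors(sample)
if errors and 'not authorized' in errors:
    print('Login required:', errors.removesuffix('.'))
else:
    print(f'Error(s) while querying API: {errors or "Unknown error"}')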

@@ -607,7 +615,7 @@ class TwitterIE(TwitterBaseIE):
# has mp4 formats via mobile API
'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
'info_dict': {
'id': '852138619213144067',
'id': '852077943283097602',
'ext': 'mp4',
'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN',
@@ -616,8 +624,16 @@ class TwitterIE(TwitterBaseIE):
'duration': 277.4,
'timestamp': 1492000653,
'upload_date': '20170412',
'display_id': '852138619213144067',
'age_limit': 0,
'uploader_url': 'https://twitter.com/news_al3alm',
'thumbnail': r're:^https?://.*\.jpg',
'tags': [],
'repost_count': int,
'view_count': int,
'like_count': int,
'comment_count': int,
},
'skip': 'Account suspended',
}, {
'url': 'https://twitter.com/i/web/status/910031516746514432',
'info_dict': {
@@ -1024,6 +1040,7 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int,
},
'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}},
'skip': 'Protected tweet',
}, {
# orig tweet w/ graphql
'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
@@ -1047,6 +1064,7 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int,
'comment_count': int,
},
'skip': 'Protected tweet',
}, {
# onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@@ -1103,6 +1121,8 @@ def _graphql_to_legacy(self, data, twid):
reason = result.get('reason')
if reason == 'NsfwLoggedOut':
self.raise_login_required('NSFW tweet requires authentication')
elif reason == 'Protected':
self.raise_login_required('You are not authorized to view this protected tweet')
raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True)

status = result.get('legacy', {})
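The 'Protected' branch is new; together with 'NsfwLoggedOut' it turns a GraphQL unavailability reason into a login prompt rather than a generic failure. A small standalone sketch of that mapping (the result shape and function name are assumptions for illustration):

# Sketch only: maps GraphQL unavailability reasons to user-facing messages.
LOGIN_REASONS = {
    'NsfwLoggedOut': 'NSFW tweet requires authentication',
    'Protected': 'You are not authorized to view this protected tweet',
}

def unavailability_message(result):
    reason = result.get('reason')
    return LOGIN_REASONS.get(reason) or reason or 'Requested tweet is unavailable'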
@@ -1187,22 +1207,39 @@ def _build_graphql_query(self, media_id):
}
}

def _extract_status(self, twid):
if self.is_logged_in:
return self._graphql_to_legacy(
self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid)

try:
if self._configuration_arg('legacy_api'):
status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
}), 'retweeted_status', None)
else:
status = self._graphql_to_legacy(
self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid)

except ExtractorError as e:
if e.expected:
raise
self.report_warning(
f'{e.orig_msg}. Falling back to syndication endpoint; some metadata may be missing', twid)
status = self._download_json(
'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
headers={'User-Agent': 'Googlebot'}, query={'id': twid})
status['extended_entities'] = {'media': status.get('mediaDetails')}

return status
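The new fallback downloads the tweet from Twitter's public syndication endpoint when the primary API call fails with an unexpected error. A self-contained sketch of that request using only the standard library (the endpoint, id query parameter, Googlebot User-Agent and the extended_entities shim are taken from the diff; everything else, including the lack of error handling, is illustrative):

import json
import urllib.parse
import urllib.request

def fetch_syndication_tweet(twid):
    # Same endpoint, query and User-Agent as the fallback above.
    url = 'https://cdn.syndication.twimg.com/tweet-result?' + urllib.parse.urlencode({'id': twid})
    req = urllib.request.Request(url, headers={'User-Agent': 'Googlebot'})
    with urllib.request.urlopen(req) as resp:
        status = json.load(resp)
    # Mirror the shim above so downstream code can keep reading extended_entities.media
    status['extended_entities'] = {'media': status.get('mediaDetails')}
    return status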

def _real_extract(self, url):
twid, selected_index = self._match_valid_url(url).group('id', 'index')
if not self.is_logged_in and self._configuration_arg('legacy_api'):
status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
}), 'retweeted_status', None)
elif not self.is_logged_in:
status = self._graphql_to_legacy(
self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid)
else:
status = self._graphql_to_legacy(
self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid)
status = self._extract_status(twid)

title = description = traverse_obj(
status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or ''
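The expression above picks the first available of full_text or text and flattens newlines into spaces; for readers unfamiliar with traverse_obj, a plain-Python equivalent (illustrative only, not the extractor's code):

# Illustrative equivalent of the title/description selection above.
def tweet_title(status):
    for key in ('full_text', 'text'):
        value = status.get(key)
        if isinstance(value, str):
            return value.replace('\n', ' ')
    return ''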
@@ -1230,7 +1267,10 @@ def _real_extract(self, url):
}

def extract_from_video_info(media):
media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none)
media_id = traverse_obj(media, 'id_str', 'id', (
'video_info', 'variants', ..., 'url',
{functools.partial(re.search, r'_video/(\d+)/')}, 1
), get_all=False, expected_type=str_or_none) or twid
self.write_debug(f'Extracting from video info: {media_id}')

formats = []
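The widened media_id lookup falls back to parsing the numeric id out of a video variant URL when id_str/id are missing, which is also why import functools is added at the top of the file (for functools.partial(re.search, ...)). A quick illustration of what that regex extracts; the sample URL shape is an assumption for the example:

import re

# The pattern is taken from the diff; the sample variant URL is made up for the example.
sample_variant_url = 'https://video.twimg.com/ext_tw_video/1600649710662213632/pu/vid/720x1280/clip.mp4'
match = re.search(r'_video/(\d+)/', sample_variant_url)
media_id = match.group(1) if match else None
print(media_id)  # -> 1600649710662213632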