[ie/twitter] Add fallback, improve error handling #7621

Merged — 16 commits, Jul 29, 2023
78 changes: 59 additions & 19 deletions yt_dlp/extractor/twitter.py
@@ -1,3 +1,4 @@
import functools
import json
import re

@@ -279,6 +280,12 @@ def input_dict(subtask_id, text):
'Submitting confirmation code', headers, data=build_login_json(input_dict(
next_subtask, self._get_tfa_info('confirmation code sent to your email or phone'))))

elif next_subtask == 'ArkoseLogin':
self.raise_login_required('Twitter is requiring captcha for this login attempt', method='cookies')

elif next_subtask == 'DenyLoginSubtask':
self.raise_login_required('Twitter rejected this login attempt as suspicious', method='cookies')

elif next_subtask == 'LoginSuccessSubtask':
raise ExtractorError('Twitter API did not grant auth token cookie')
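The two new branches above surface captcha and suspicious-login blocks instead of letting the flow fail with an opaque error. Purely as an illustration of the dispatch pattern (not the extractor's actual structure; the plain exception and message suffix below are assumptions), the same mapping could be table-driven:

# Illustrative sketch only: the extractor uses an if/elif chain plus
# self.raise_login_required(..., method='cookies'); this shows the same mapping as a table.
LOGIN_BLOCKERS = {
    'ArkoseLogin': 'Twitter is requiring captcha for this login attempt',
    'DenyLoginSubtask': 'Twitter rejected this login attempt as suspicious',
}

def check_login_subtask(next_subtask):
    if next_subtask in LOGIN_BLOCKERS:
        # Hypothetical stand-in for raise_login_required()
        raise RuntimeError(f'{LOGIN_BLOCKERS[next_subtask]}. Use browser cookies instead')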

@@ -304,8 +311,9 @@ def _call_api(self, path, video_id, query={}, graphql=False):

if result.get('errors'):
errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str}))))
raise ExtractorError(
f'Error(s) while querying API: {errors or "Unknown error"}', expected=True)
if errors and 'not authorized' in errors:
self.raise_login_required(remove_end(errors, '.'))
raise ExtractorError(f'Error(s) while querying API: {errors or "Unknown error"}')

return result
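For context, the check above joins every errors[].message string from the API response and only asks for login when the message mentions authorization. A minimal plain-Python sketch of that logic (the sample payload is an assumption, and stdlib code stands in for yt-dlp's traverse_obj/remove_end helpers):

# Plain-Python sketch of the error handling above; the sample payload is illustrative.
def summarize_api_errors(result):
    messages = {e.get('message') for e in result.get('errors') or []
                if isinstance(e.get('message'), str)}
    return ', '.join(sorted(messages))

sample = {'errors': [{'message': 'Sorry, you are not authorized to see this status.'}]}
errors = summarize_api_errors(sample)
if errors and 'not authorized' in errors:
    print('Login required:', errors.removesuffix('.'))
else:
    print(f'Error(s) while querying API: {errors or "Unknown error"}')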

@@ -607,7 +615,7 @@ class TwitterIE(TwitterBaseIE):
# has mp4 formats via mobile API
'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
'info_dict': {
'id': '852138619213144067',
'id': '852077943283097602',
'ext': 'mp4',
'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN',
@@ -616,8 +624,16 @@ class TwitterIE(TwitterBaseIE):
'duration': 277.4,
'timestamp': 1492000653,
'upload_date': '20170412',
'display_id': '852138619213144067',
'age_limit': 0,
'uploader_url': 'https://twitter.com/news_al3alm',
'thumbnail': r're:^https?://.*\.jpg',
'tags': [],
'repost_count': int,
'view_count': int,
'like_count': int,
'comment_count': int,
},
'skip': 'Account suspended',
}, {
'url': 'https://twitter.com/i/web/status/910031516746514432',
'info_dict': {
@@ -1024,6 +1040,7 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int,
},
'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}},
'skip': 'Protected tweet',
}, {
# orig tweet w/ graphql
'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
@@ -1047,6 +1064,7 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int,
'comment_count': int,
},
'skip': 'Protected tweet',
}, {
# onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@@ -1103,6 +1121,8 @@ def _graphql_to_legacy(self, data, twid):
reason = result.get('reason')
if reason == 'NsfwLoggedOut':
self.raise_login_required('NSFW tweet requires authentication')
elif reason == 'Protected':
self.raise_login_required('You are not authorized to view this protected tweet')
raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True)

status = result.get('legacy', {})
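The 'Protected' branch is new; together with 'NsfwLoggedOut' it turns a GraphQL unavailability reason into a login prompt rather than a generic failure. A small standalone sketch of that mapping (the result shape and function name are assumptions for illustration):

# Sketch only: maps GraphQL unavailability reasons to user-facing messages.
LOGIN_REASONS = {
    'NsfwLoggedOut': 'NSFW tweet requires authentication',
    'Protected': 'You are not authorized to view this protected tweet',
}

def unavailability_message(result):
    reason = result.get('reason')
    return LOGIN_REASONS.get(reason) or reason or 'Requested tweet is unavailable'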
@@ -1187,22 +1207,39 @@ def _build_graphql_query(self, media_id):
}
}

def _extract_status(self, twid):
if self.is_logged_in:
return self._graphql_to_legacy(
self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid)

try:
if self._configuration_arg('legacy_api'):
status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
}), 'retweeted_status', None)
else:
status = self._graphql_to_legacy(
self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid)

except ExtractorError as e:
if e.expected:
raise
self.report_warning(
f'{e.orig_msg}. Falling back to syndication endpoint; some metadata may be missing', twid)
status = self._download_json(
'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
headers={'User-Agent': 'Googlebot'}, query={'id': twid})
status['extended_entities'] = {'media': status.get('mediaDetails')}

return status
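The new fallback downloads the tweet from Twitter's public syndication endpoint when the primary API call fails with an unexpected error. A self-contained sketch of that request using only the standard library (the endpoint, id query parameter, Googlebot User-Agent and the extended_entities shim are taken from the diff; everything else, including the lack of error handling, is illustrative):

import json
import urllib.parse
import urllib.request

def fetch_syndication_tweet(twid):
    # Same endpoint, query and User-Agent as the fallback above.
    url = 'https://cdn.syndication.twimg.com/tweet-result?' + urllib.parse.urlencode({'id': twid})
    req = urllib.request.Request(url, headers={'User-Agent': 'Googlebot'})
    with urllib.request.urlopen(req) as resp:
        status = json.load(resp)
    # Mirror the shim above so downstream code can keep reading extended_entities.media
    status['extended_entities'] = {'media': status.get('mediaDetails')}
    return status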

def _real_extract(self, url):
twid, selected_index = self._match_valid_url(url).group('id', 'index')
if not self.is_logged_in and self._configuration_arg('legacy_api'):
status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
}), 'retweeted_status', None)
elif not self.is_logged_in:
status = self._graphql_to_legacy(
self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid)
else:
status = self._graphql_to_legacy(
self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid)
status = self._extract_status(twid)

title = description = traverse_obj(
status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or ''
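The expression above picks the first available of full_text or text and flattens newlines into spaces; for readers unfamiliar with traverse_obj, a plain-Python equivalent (illustrative only, not the extractor's code):

# Illustrative equivalent of the title/description selection above.
def tweet_title(status):
    for key in ('full_text', 'text'):
        value = status.get(key)
        if isinstance(value, str):
            return value.replace('\n', ' ')
    return ''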
@@ -1230,7 +1267,10 @@ def _real_extract(self, url):
}

def extract_from_video_info(media):
media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none)
media_id = traverse_obj(media, 'id_str', 'id', (
'video_info', 'variants', ..., 'url',
{functools.partial(re.search, r'_video/(\d+)/')}, 1
), get_all=False, expected_type=str_or_none) or twid
self.write_debug(f'Extracting from video info: {media_id}')

formats = []
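The widened media_id lookup falls back to parsing the numeric id out of a video variant URL when id_str/id are missing, which is also why import functools is added at the top of the file (for functools.partial(re.search, ...)). A quick illustration of what that regex extracts; the sample URL shape is an assumption for the example:

import re

# The pattern is taken from the diff; the sample variant URL is made up for the example.
sample_variant_url = 'https://video.twimg.com/ext_tw_video/1600649710662213632/pu/vid/720x1280/clip.mp4'
match = re.search(r'_video/(\d+)/', sample_variant_url)
media_id = match.group(1) if match else None
print(media_id)  # -> 1600649710662213632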