Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[threads] Add extractor (Threads.net) #9852

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions supportedsites.md
Original file line number Diff line number Diff line change
Expand Up @@ -1455,6 +1455,8 @@
- **ThisVid**
- **ThisVidMember**
- **ThisVidPlaylist**
- **Threads**
- **ThreadsIOS**: Threads' iOS `barcelona://` URL
- **ThreeSpeak**
- **ThreeSpeakUser**
- **TikTok**
Expand Down
4 changes: 4 additions & 0 deletions yt_dlp/extractor/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2067,6 +2067,10 @@
ThisVidMemberIE,
ThisVidPlaylistIE,
)
from .threads import (
ThreadsIE,
ThreadsIOSIE,
)
from .threeqsdn import ThreeQSDNIE
from .threespeak import (
ThreeSpeakIE,
Expand Down
157 changes: 157 additions & 0 deletions yt_dlp/extractor/threads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
from .common import InfoExtractor
from ..utils import (
remove_end,
strftime_or_none,
strip_or_none,
)
from ..utils.traversal import traverse_obj


class ThreadsIE(InfoExtractor):
    """Extract the video(s) attached to a single threads.net post.

    The post data is read from a JSON blob embedded in a ``<script>`` tag of
    the post page; page-level metadata (title, description, uploader name)
    comes from the HTML title and OpenGraph tags.
    """

    _VALID_URL = r'https?://(?:www\.)?threads\.net/(?P<uploader>[^/]+)/post/(?P<id>[^/?#&]+)/?(?P<embed>embed.*?)?'

    _TESTS = [{
        'url': 'https://www.threads.net/@tntsportsbr/post/C6cqebdCfBi',
        'info_dict': {
            'id': 'C6cqebdCfBi',
            'ext': 'mp4',
            'title': 'md5:062673d04195aa2d99b8d7a11798cb9d',
            'description': 'md5:fe0c73f9a892fb92efcc67cc075561b0',
            'uploader': 'TNT Sports Brasil',
            'uploader_id': 'tntsportsbr',
            'uploader_url': 'https://www.threads.net/@tntsportsbr',
            'channel': 'tntsportsbr',
            'channel_url': 'https://www.threads.net/@tntsportsbr',
            'timestamp': 1714613811,
            'upload_date': '20240502',
            'like_count': int,
            'channel_is_verified': bool,
            'thumbnail': r're:^https?://.*\.jpg',
        },
    }, {
        'url': 'https://www.threads.net/@felipebecari/post/C6cM_yNPHCF',
        'info_dict': {
            'id': 'C6cM_yNPHCF',
            'ext': 'mp4',
            'title': '@felipebecari • Sobre o futuro dos dois últimos resgatados: tem muita notícia boa! 🐶❤️',
            'description': 'Sobre o futuro dos dois últimos resgatados: tem muita notícia boa! 🐶❤️',
            'uploader': 'Felipe Becari',
            'uploader_id': 'felipebecari',
            'uploader_url': 'https://www.threads.net/@felipebecari',
            'channel': 'felipebecari',
            'channel_url': 'https://www.threads.net/@felipebecari',
            'timestamp': 1714598318,
            'upload_date': '20240501',
            'like_count': int,
            'channel_is_verified': bool,
            'thumbnail': r're:^https?://.*\.jpg',
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the post addressed by *url*.

        The embedded JSON and its ``"result"`` object are mandatory (their
        lookups are fatal). Everything read from the post object itself is
        optional: a missing key yields a ``None`` field, and a post without
        any video is reported through ``raise_no_formats``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The page has several <script> blobs; pick the one mentioning this post code
        json_data = self._search_regex(
            r'<script[^>]+>(.*"code":"%s".*)</script>' % video_id,
            webpage, 'main json')
        result = self._search_json(
            r'"result":', json_data, 'result data', video_id)

        # A thread page lists the requested post alongside others (presumably
        # replies/parents); keep the first one whose code matches the URL.
        # Falling back to {} keeps the lookups below safe when nothing matches.
        post = traverse_obj(result, (
            'data', 'data', 'edges', ..., 'node', 'thread_items',
            lambda _, v: v['post']['code'] == video_id, 'post'), get_all=False) or {}

        # Multi-video posts keep their media under 'carousel_media';
        # single-video posts carry 'video_versions' directly on the post
        formats = []
        for media in post.get('carousel_media') or [post]:
            # Entries without 'video_versions' (e.g. images) contribute nothing
            for video in traverse_obj(media, ('video_versions', lambda _, v: v['url'])):
                formats.append({
                    'format_id': f'{media.get("pk")}-{video.get("type")}',  # id-type
                    'url': video['url'],
                    'width': media.get('original_width'),
                    'height': media.get('original_height'),
                })

        if not formats:
            self.raise_no_formats('No video could be found in this post', video_id=video_id)

        thumbnails = [{
            'url': thumb['url'],
            'width': thumb.get('width'),
            'height': thumb.get('height'),
        } for thumb in traverse_obj(post, ('image_versions2', 'candidates', lambda _, v: v['url']))]

        username = traverse_obj(post, ('user', 'username'))
        # Avoid emitting a bogus ".../@None" URL when the username is unknown
        profile_url = f'https://www.threads.net/@{username}' if username else None
        timestamp = post.get('taken_at')

        return {
            'id': video_id,
            'title': strip_or_none(remove_end(self._html_extract_title(webpage), '• Threads')),
            'description': self._og_search_description(webpage),
            # The display name is whatever precedes " (" in the og:title;
            # it is optional metadata, so its absence must not abort extraction
            'uploader': self._search_regex(
                r'(.*?) \(', self._og_search_title(webpage), 'uploader', fatal=False),
            'uploader_id': username,
            'uploader_url': profile_url,
            'channel': username,
            'channel_url': profile_url,
            'channel_is_verified': traverse_obj(post, ('user', 'is_verified')),
            'timestamp': timestamp,
            'upload_date': strftime_or_none(timestamp),
            'like_count': post.get('like_count'),
            'formats': formats,
            'thumbnails': thumbnails,
        }


class ThreadsIOSIE(InfoExtractor):
    """Resolve Threads' iOS ``barcelona://`` URLs by delegating to ThreadsIE."""

    IE_DESC = 'IOS barcelona:// URL'
    _VALID_URL = r'barcelona://media\?shortcode=(?P<id>[^/?#&]+)'
    _TESTS = [
        {
            'url': 'barcelona://media?shortcode=C6fDehepo5D',
            'info_dict': {
                'id': 'C6fDehepo5D',
                'ext': 'mp4',
                'title': 'md5:dc92f960981b8b3a33eba9681e9fdfc6',
                'description': 'md5:0c36a7e67e1517459bc0334dba932164',
                'uploader': 'Sa\u0303o Paulo Futebol Clube',
                'uploader_id': 'saopaulofc',
                'uploader_url': 'https://www.threads.net/@saopaulofc',
                'channel': 'saopaulofc',
                'channel_url': 'https://www.threads.net/@saopaulofc',
                'timestamp': 1714694014,
                'upload_date': '20240502',
                'like_count': int,
                'channel_is_verified': bool,
                'thumbnail': r're:^https?://.*\.jpg',
            },
            'add_ie': ['Threads'],
        },
    ]

    def _real_extract(self, url):
        """Map the shortcode in *url* onto a web post URL handled by ThreadsIE."""
        shortcode = self._match_id(url)

        # The deep link carries no username. Per the original author's note,
        # Threads redirects to the right user, so "**" is used as a placeholder
        # in the user segment instead of looking the real name up.
        web_url = f'http://www.threads.net/**/post/{shortcode}'
        return self.url_result(web_url, ie=ThreadsIE, video_id=shortcode)
Loading