Import libraries

In [32]:
import json
import os
import time
from datetime import datetime, timezone
from getpass import getpass

from atproto import Client

Log in to a Bluesky account to get access to the API.

In [None]:
# Credentials are never written into the notebook: they are read from the
# BLUESKY_HANDLE / BLUESKY_APP_PASSWORD environment variables, falling back to
# an interactive prompt (the password prompt does not echo the input).
client = Client()
bluesky_handle = os.environ.get("BLUESKY_HANDLE") or input("Bluesky handle: ")
bluesky_password = os.environ.get("BLUESKY_APP_PASSWORD") or getpass("Bluesky app password: ")
client.login(bluesky_handle, bluesky_password)

Helper function that parses each post's timestamp and normalizes it to a date; it is used later to compare a post's date against the end date we set.

In [34]:
def parse_date(date_str):
    '''
    Parse an ISO 8601 timestamp string and return its calendar date in UTC.

    Input: 
        date_str (str): The date string to be parsed; a trailing "Z" is
            accepted as a UTC marker.
        
    Return: a datetime.date object (not a datetime), or None if parsing fails
    '''
    if not isinstance(date_str, str):
        # A missing timestamp (e.g. None) is reported like any other parse
        # failure instead of raising AttributeError on .replace().
        print(f"Error parsing date: {date_str}")
        return None

    try:
        # Spell a trailing "Z" as an explicit offset, which fromisoformat()
        # accepts on every Python version that provides it.
        parsed = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        print(f"Error parsing date: {date_str}")
        return None

    # Shift offset-aware timestamps to UTC before dropping the time part, so
    # posts written with different UTC offsets are compared on the same
    # calendar. Naive timestamps carry no offset and are left unchanged.
    if parsed.tzinfo is not None:
        parsed = parsed.astimezone(timezone.utc)

    return parsed.date()

Scrape all posts matching the search term 'California fire', paging through the results until we reach posts dated before 1/6/2025.

In [None]:
query = "California fire"  # search term
# Earliest date of posts to scrape.
# end_date is set a day before the fire to leave slack for time zone conversion.
end_date = datetime(2025, 1, 6)

tw = []        # one dict per scraped post (read by the cells below)
cursor = None  # pagination cursor; None requests the first page

while True:
    # Ask for newest-first results explicitly: the early stop below is only
    # correct when posts arrive in that order.
    search_params = {'q': query, 'sort': 'latest'}
    if cursor:
        search_params['cursor'] = cursor

    try:
        page = client.app.bsky.feed.search_posts(search_params)
    except Exception as e:
        print(f"Error processing page: {e}")
        break

    reached_end_date = False

    # Every page is processed, including the first one.
    for post in page.posts:
        try:
            # convert and normalize date formats
            post_date = parse_date(post.record.created_at)

            # A single unparseable timestamp should not end the whole scrape;
            # skip just that post (parse_date already reported it).
            if post_date is None:
                continue

            # stop processing once posts are older than end_date
            if post_date < end_date.date():
                reached_end_date = True
                break

            thread = client.app.bsky.feed.get_post_thread({'uri': post.uri})

            # Reply texts for this thread. `replies` can be missing or None,
            # and entries without a `.post` attribute are skipped so they do
            # not cause the whole post to be dropped.
            replies = getattr(thread.thread, 'replies', None) or []
            comment_data = [
                reply.post.record.text
                for reply in replies
                if getattr(reply, 'post', None) is not None
            ]

            tw.append({
                'author': post.author.handle,
                'text': post.record.text,
                'timestamp': post.record.created_at,
                'likes': post.like_count,
                'reposts': post.repost_count,
                'comment': comment_data
            })

        except Exception as e:
            print(f"Error processing post: {e}")
            continue

    # Finish when the date limit was reached or there are no further pages.
    if reached_end_date or not page.cursor:
        cursor = None
        break
    cursor = page.cursor

print("Done!")


Check our data

In [None]:
# Sanity check: report how many posts the scrape collected.
# print(tw[10:])  # uncomment to inspect the records themselves
post_total = len(tw)
print(post_total)

Write the dataset to a file in JSON Lines format (one JSON object per line)

In [None]:
filename = "bluesky_posts_0309.json"

# Each record is serialized on its own line, so despite the .json extension
# the file is in JSON Lines layout and must be read back line by line.
with open(filename, 'w', encoding='utf-8') as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in tw)

print(f"Data saved to {filename}")