# Analyze Real World Social Media Data (Pre-Prediction/Non-Prediction Label): Bluesky

1. WEBSITE: [About Bluesky via Open Measures](https://openmeasures.io/platform)

In [1]:
import os
import sys
import json

import pandas as pd
from tqdm import tqdm

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# print(notebook_dir)
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

from data_processing import DataProcessing

In [11]:
pd.set_option('max_colwidth', 800)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

## Load Data

In [2]:
base_path = os.path.join(notebook_dir, '../', 'data/open_measures/', 'bluesky_raw_data/')
files = os.listdir(base_path)

dfs = []

for file in tqdm(files):
    full_path = os.path.join(base_path, file)
    df = DataProcessing.load_from_file(full_path, 'csv')
    dfs.append(df)

dfs

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 70.36it/s]


[     Unnamed: 0               $type                            author  \
 0             0  app.bsky.feed.post  did:plc:mcb6n67plnrlx4lg35natk2b   
 1             1  app.bsky.feed.post  did:plc:mcb6n67plnrlx4lg35natk2b   
 2             2  app.bsky.feed.post  did:plc:a2ijicgkwatuhlfss36lvwfx   
 3             3  app.bsky.feed.post  did:plc:mcb6n67plnrlx4lg35natk2b   
 4             4  app.bsky.feed.post  did:plc:mcb6n67plnrlx4lg35natk2b   
 ..          ...                 ...                               ...   
 651         651  app.bsky.feed.post  did:plc:cywy7vw3shrn7vp3ybgfrx33   
 652         652  app.bsky.feed.post  did:plc:nrr6yppar26qag7p2q3rawp7   
 653         653  app.bsky.feed.post  did:plc:m67kp6uoter7aeftq5nkzncm   
 654         654  app.bsky.feed.post  did:plc:eyaz2kbzyxmg5hgkhb3w7s25   
 655         655  app.bsky.feed.post  did:plc:5jycdvkvabnon545dxcisari   
 
                                                    cid  \
 0    zdpuAykzkHo8uZtsa4qZyQTtSePWRbHoKAJgxYwxqcrTx

## Data Processing

+ Data stored as dfs[df_1, df_2, ... n].
+ Concat all DFs to one.
+ `data_range_per_file` is the index 0 to n_1 belongs to df_1, 0_n_2 belongs to n_2, etc.

In [3]:
df = DataProcessing.concat_dfs(dfs)
df.rename(columns={"Unnamed: 0": "data_range_per_file"}, inplace=True)
df

Unnamed: 0,data_range_per_file,$type,author,cid,createdAt,embed,facets,indexedAt,sequence,text,...,langs,reply,tags,bridgyOriginalText,bridgyOriginalUrl,openmeasures_meta,rkey,rkey_str,Query Params,Site
0,0,app.bsky.feed.post,did:plc:mcb6n67plnrlx4lg35natk2b,zdpuAykzkHo8uZtsa4qZyQTtSePWRbHoKAJgxYwxqcrTxx6fi,2024-01-13T23:50:40.300023+00:00,"{'$type': 'app.bsky.embed.images', 'images': [...","[{'$type': 'app.bsky.richtext.facet', 'feature...",2024-01-13T23:50:40.805738,570168565.0,Here's the Top 10 Trending Words over the Past...,...,,,,,,,,,{'term': '(NFL OR nfl) AND (playoffs) AND (Sup...,bluesky
1,1,app.bsky.feed.post,did:plc:mcb6n67plnrlx4lg35natk2b,zdpuAt42bPRcafad5PjKbsxSXBETMXPbP7Tb8H5Mu3NV78QCw,2024-01-15T04:20:38.045294+00:00,"{'$type': 'app.bsky.embed.images', 'images': [...","[{'$type': 'app.bsky.richtext.facet', 'feature...",2024-01-15T04:26:24.470523,574404975.0,Here's the Top 10 Trending Words over the Past...,...,,,,,,,,,{'term': '(NFL OR nfl) AND (playoffs) AND (Sup...,bluesky
2,2,app.bsky.feed.post,did:plc:a2ijicgkwatuhlfss36lvwfx,bafyreibsxwnoso4kq6oltibk3xzbhqbjtlx7bmwxi73rs...,2024-01-17T18:52:28Z,"{'$type': 'app.bsky.embed.external', 'external...","[{'$type': 'app.bsky.richtext.facet', 'feature...",,,Does defense even matter in the NFL postseason...,...,,,,,,,,,{'term': '(NFL OR nfl) AND (playoffs) AND (Sup...,bluesky
3,3,app.bsky.feed.post,did:plc:mcb6n67plnrlx4lg35natk2b,zdpuAuxtMWrau1PvrZQC7mC1DokTWBdYsK4mCJQyCnySk5aWb,2024-01-21T04:30:46.145649+00:00,"{'$type': 'app.bsky.embed.images', 'images': [...","[{'$type': 'app.bsky.richtext.facet', 'feature...",2024-01-21T04:30:47.004301,596638410.0,Here's the Top 10 Trending Words over the Past...,...,,,,,,,,,{'term': '(NFL OR nfl) AND (playoffs) AND (Sup...,bluesky
4,4,app.bsky.feed.post,did:plc:mcb6n67plnrlx4lg35natk2b,zdpuArf8hkuDPBSJgJcwyhc1Du712poHWk73dRam3TuVDzgy4,2024-01-21T23:20:45.897184+00:00,"{'$type': 'app.bsky.embed.images', 'images': [...","[{'$type': 'app.bsky.richtext.facet', 'feature...",2024-01-21T23:20:46.653710,599632070.0,Here's the Top 10 Trending Words over the Past...,...,,,,,,,,,{'term': '(NFL OR nfl) AND (playoffs) AND (Sup...,bluesky
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,625,app.bsky.feed.post,did:plc:cywy7vw3shrn7vp3ybgfrx33,bafyreian7dv3sdwf35zyhpix3zknyu4c2omfo2qsbvxhs...,2025-02-08T18:53:17.000Z,"{'$type': 'app.bsky.embed.external', 'external...",,,,,...,,,,"<figure>\n <img alt=""Josh Allen, Buffalo ...",https://www.sbnation.com/golf/2025/2/8/2436163...,"{'collected_by': 'jetstream', 'last_scraped_ts...",,,{'term': '(NFL OR nfl) AND (playoffs) AND (Sup...,bluesky
4517,626,app.bsky.feed.post,did:plc:nrr6yppar26qag7p2q3rawp7,bafyreigsq3sbskb7hwcafektwr7p5i75v2gwgd7a675ii...,2025-02-08T20:00:00.000Z,"{'$type': 'app.bsky.embed.external', 'external...",,,,,...,,,['Esportes'],‚ÄúCom minha camiseta dos Eagles pendurada na po...,https://tribunaonline.com.br/esportes/taylor-s...,"{'collected_by': 'jetstream', 'last_scraped_ts...",,,{'term': '(NFL OR nfl) AND (playoffs) AND (Sup...,bluesky
4518,627,app.bsky.feed.post,did:plc:m67kp6uoter7aeftq5nkzncm,bafyreigxo3xazujg34iw32q72dphpcsen665hebk4dwsr...,2025-02-08T22:00:00.720Z,"{'$type': 'app.bsky.embed.images', 'images': [...",[{'features': [{'$type': 'app.bsky.richtext.fa...,,,The Chiefs' George Karlaftis ranks No. 1 in pr...,...,,,,,,"{'collected_by': 'jetstream', 'last_scraped_ts...",,,{'term': '(NFL OR nfl) AND (playoffs) AND (Sup...,bluesky
4519,628,app.bsky.feed.post,did:plc:eyaz2kbzyxmg5hgkhb3w7s25,bafyreigtzpa4bsolb5qkuc35735wcxgr4xsee3q46hgax...,2025-02-08T22:12:51.918Z,"{'$type': 'app.bsky.embed.external', 'external...",[{'features': [{'$type': 'app.bsky.richtext.fa...,,,www.bothteamsplayhard.wordpress.com/2025/02/08...,...,['en'],,,,,"{'collected_by': 'jetstream', 'last_scraped_ts...",,,{'term': '(NFL OR nfl) AND (playoffs) AND (Sup...,bluesky


In [4]:
df.columns

Index(['data_range_per_file', '$type', 'author', 'cid', 'createdAt', 'embed',
       'facets', 'indexedAt', 'sequence', 'text', 'uri', 'lastseents',
       'authorProfile', 'embeds', 'entities', 'labels', 'langs', 'reply',
       'tags', 'bridgyOriginalText', 'bridgyOriginalUrl', 'openmeasures_meta',
       'rkey', 'rkey_str', 'Query Params', 'Site'],
      dtype='object')

In [5]:
def user_json_to_df(df):
    """
    Users in 'account' col is stored as str'json', so need to convert and split the keys : cols and values to entries.
    
    """
    users = []
    error_processing_users = []
    
    for idx, row in df.iterrows():
        account = row['author']
        content = row['text']
        date_time = row['createdAt']
        datatype = row['$type']
        if idx <= 3:
            print(f"=======Index: {idx}=======\n\tAccount: {account}\n\tContent: {content}\n\tDate: {date_time}\n\tType of Post: {datatype}\n")
        
        try:
            # json_to_str = account.replace("'", '"') # Convert single quotes to double quotes to make it valid JSON
            # json_obj = json.loads(json_to_str) # Parse the JSON string
            # print(f"User: {type(account)}")
            user_series = pd.Series(account)
            user_series['text'] = content
            update_date_time = pd.to_datetime(date_time)
            user_series['date_with_time'] = update_date_time
            user_series['type_of_post'] = datatype
            users.append(user_series)
        except ValueError as e:
            user_to_error = (account, e)
            error_processing_users.append(user_to_error)
            continue
        except AttributeError as e:
            user_to_error = (account, e)
            error_processing_users.append(user_to_error)
            continue
            
        users_df = pd.concat(users, axis=1)
    return users_df.T, error_processing_users

In [6]:
user_account_info_df, error_processing_users = user_json_to_df(df)
user_account_info_df

	Account: did:plc:mcb6n67plnrlx4lg35natk2b
	Content: Here's the Top 10 Trending Words over the Past 10 Minutes:
Ô∏è‚ö†Ô∏èüåÄ2 - flacco
Ô∏è‚ö†Ô∏èüåÄ1 - browns
3rd - texans
4th - elite
5th - cleveland
6th - stroud
7th - houston
8th - francisco
9th - joe
10th - freeze
(tap/click to see all posts with that word!)

#FreePalestine üáµüá∏
	Date: 2024-01-13T23:50:40.300023+00:00
	Type of Post: app.bsky.feed.post

	Account: did:plc:mcb6n67plnrlx4lg35natk2b
	Content: Here's the Top 10 Trending Words over the Past 10 Minutes:
Ô∏è‚ö†Ô∏èüåÄ3 - lions
Ô∏è‚ö†Ô∏èüåÄ1 - detroit
Ô∏è‚ö†Ô∏èüåÄ1 - playoff
4th - cowboys
5th - playoffs
6th - nfc
7th - detective
8th - rams
9th - packers
10th - eagles
(tap/click to see all posts with that word!)

#FreePalestine üáµüá∏
	Date: 2024-01-15T04:20:38.045294+00:00
	Type of Post: app.bsky.feed.post

	Account: did:plc:a2ijicgkwatuhlfss36lvwfx
	Content: Does defense even matter in the NFL postseason? I look at the last five years @TheAthleticFS https://theathlet

Unnamed: 0,0,text,date_with_time,type_of_post
0,did:plc:mcb6n67plnrlx4lg35natk2b,Here's the Top 10 Trending Words over the Past...,2024-01-13 23:50:40.300023+00:00,app.bsky.feed.post
1,did:plc:mcb6n67plnrlx4lg35natk2b,Here's the Top 10 Trending Words over the Past...,2024-01-15 04:20:38.045294+00:00,app.bsky.feed.post
2,did:plc:a2ijicgkwatuhlfss36lvwfx,Does defense even matter in the NFL postseason...,2024-01-17 18:52:28+00:00,app.bsky.feed.post
3,did:plc:mcb6n67plnrlx4lg35natk2b,Here's the Top 10 Trending Words over the Past...,2024-01-21 04:30:46.145649+00:00,app.bsky.feed.post
4,did:plc:mcb6n67plnrlx4lg35natk2b,Here's the Top 10 Trending Words over the Past...,2024-01-21 23:20:45.897184+00:00,app.bsky.feed.post
...,...,...,...,...
4516,did:plc:cywy7vw3shrn7vp3ybgfrx33,,2025-02-08 18:53:17+00:00,app.bsky.feed.post
4517,did:plc:nrr6yppar26qag7p2q3rawp7,,2025-02-08 20:00:00+00:00,app.bsky.feed.post
4518,did:plc:m67kp6uoter7aeftq5nkzncm,The Chiefs' George Karlaftis ranks No. 1 in pr...,2025-02-08 22:00:00.720000+00:00,app.bsky.feed.post
4519,did:plc:eyaz2kbzyxmg5hgkhb3w7s25,www.bothteamsplayhard.wordpress.com/2025/02/08...,2025-02-08 22:12:51.918000+00:00,app.bsky.feed.post


In [12]:
user_account_info_df

Unnamed: 0,0,text,date_with_time,type_of_post
0,did:plc:mcb6n67plnrlx4lg35natk2b,Here's the Top 10 Trending Words over the Past 10 Minutes:\nÔ∏è‚ö†Ô∏èüåÄ2 - flacco\nÔ∏è‚ö†Ô∏èüåÄ1 - browns\n3rd - texans\n4th - elite\n5th - cleveland\n6th - stroud\n7th - houston\n8th - francisco\n9th - joe\n10th - freeze\n(tap/click to see all posts with that word!)\n\n#FreePalestine üáµüá∏,2024-01-13 23:50:40.300023+00:00,app.bsky.feed.post
1,did:plc:mcb6n67plnrlx4lg35natk2b,Here's the Top 10 Trending Words over the Past 10 Minutes:\nÔ∏è‚ö†Ô∏èüåÄ3 - lions\nÔ∏è‚ö†Ô∏èüåÄ1 - detroit\nÔ∏è‚ö†Ô∏èüåÄ1 - playoff\n4th - cowboys\n5th - playoffs\n6th - nfc\n7th - detective\n8th - rams\n9th - packers\n10th - eagles\n(tap/click to see all posts with that word!)\n\n#FreePalestine üáµüá∏,2024-01-15 04:20:38.045294+00:00,app.bsky.feed.post
2,did:plc:a2ijicgkwatuhlfss36lvwfx,Does defense even matter in the NFL postseason? I look at the last five years @TheAthleticFS https://theathletic.com/5206130/2024/01/17/offense-defense-performance-nfl-playoffs/,2024-01-17 18:52:28+00:00,app.bsky.feed.post
3,did:plc:mcb6n67plnrlx4lg35natk2b,Here's the Top 10 Trending Words over the Past 10 Minutes:\nÔ∏è‚ö†Ô∏èüåÄ2 - packers\nÔ∏è‚ö†Ô∏èüåÄ1 - 49ers\nÔ∏è‚ö†Ô∏èüåÄ1 - niners\n4th - lions\n5th - nfc\n6th - bay\n7th - „ÅäÊòº\n8th - purdy\n9th - championship\n10th - favre\n(tap/click to see all posts with that word!)\n\n#FreePalestine üáµüá∏,2024-01-21 04:30:46.145649+00:00,app.bsky.feed.post
4,did:plc:mcb6n67plnrlx4lg35natk2b,Here's the Top 10 Trending Words over the Past 10 Minutes:\nÔ∏è‚ö†Ô∏èüåÄ7 - lions\nÔ∏è‚ö†Ô∏èüåÄ3 - detroit\nÔ∏è‚ö†Ô∏èüåÄ1 - baker\nÔ∏è‚ö†Ô∏èüåÄ1 - desantis\nÔ∏è‚ö†Ô∏èüåÄ1 - championship\nÔ∏è‚ö†Ô∏èüåÄ1 - nfc\n7th - mayfield\n8th - loins\n9th - ron\n10th - playoff\n(tap/click to see all posts with that word!)\n\n#FreePalestine üáµüá∏,2024-01-21 23:20:45.897184+00:00,app.bsky.feed.post
...,...,...,...,...
4516,did:plc:cywy7vw3shrn7vp3ybgfrx33,,2025-02-08 18:53:17+00:00,app.bsky.feed.post
4517,did:plc:nrr6yppar26qag7p2q3rawp7,,2025-02-08 20:00:00+00:00,app.bsky.feed.post
4518,did:plc:m67kp6uoter7aeftq5nkzncm,The Chiefs' George Karlaftis ranks No. 1 in pressure rate vs. adjusted sack rate among defensive linemen with 30+ pass rush snaps in the NFL playoffs.\n\n Learn how he's a Super Bowl X-factor: theanalyst.com/na/2025/02/su...,2025-02-08 22:00:00.720000+00:00,app.bsky.feed.post
4519,did:plc:eyaz2kbzyxmg5hgkhb3w7s25,www.bothteamsplayhard.wordpress.com/2025/02/08/s... Here's my preview & pick for Super Bowl LIX between the Kansas City Chiefs & Philadelphia Eagles.\n\nThis post includes:\n\nConference Title Recap.\n\nKeys To Victory.\n\nFacts.\n\nClassic NFL Highlights.\n\nPick.\n\n9-3 in the Playoffs in my picks.\n\nRead it.,2025-02-08 22:12:51.918000+00:00,app.bsky.feed.post


In [7]:
error_processing_users

[]

In [10]:
user_account_info_df.type_of_post.unique()

array(['app.bsky.feed.post'], dtype=object)