In [1]:
pip install boto3

Collecting boto3
  Downloading boto3-1.36.25-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.37.0,>=1.36.25 (from boto3)
  Downloading botocore-1.36.25-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3)
  Downloading s3transfer-0.11.2-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.36.25-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.36.25-py3-none-any.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.11.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.2/84.2 kB[0m [31m5.6 MB/s[0m eta [36m0:0

In [2]:
# import colab secrets to store login credentials
from google.colab import userdata

# aws stuff
import boto3
from botocore.exceptions import ClientError

# json necessary to parse secret string, and write/read s3 objects
import json

# datetime is necessary for our ddb and s3 schema
import datetime
import zoneinfo

# for logging
import base64
import requests

In [3]:
AWS_KEY = userdata.get('aws_access_key')
AWS_SECRET_KEY = userdata.get('aws_secret_access_key')
REGION = userdata.get('aws_region')
SECRETS_ID = userdata.get('aws_secretsmanager_id')

DDB = 'dynamodb'
S3 = 's3'
DDB_TABLE = 'rickybot-ddb'
S3_BUCKET = 'rickybot-s3'

PRIMARY_KEY = 'DOW' # the dynamodb table's primary key. there is no sort key
DOW_KEYS = {
    'Sunday': 'SUN',
    'Monday': 'MON',
    'Tuesday': 'TUE',
    'Wednesday': 'WED',
    'Thursday': 'THU',
    'Friday': 'FRI+SAT',
    'Saturday': 'FRI+SAT'
}
USER_TIMEZONE = "US/Eastern"

FILE_PATH = "LOGGING_AGG_02.txt"
BRANCH = "main"

In [4]:
# get the day of the week so we know what dynamodb key to pull from and which bucket to aggregate to
# doing this first because we do not run this on saturday and can bail out early if we get into this code for some reason -- no longer skipping saturdays, so that we can get fridays follows out of the dynamodb
# also we are running this at about 1am, the following day after all runs have concluded for the previous. so we're aggregating the previous day's results
cur_timestamp = datetime.datetime.now(zoneinfo.ZoneInfo(USER_TIMEZONE))
yest_timestamp = cur_timestamp - datetime.timedelta(days=1)
yesterday = yest_timestamp.strftime("%A")

# use the day of the week to pull up the corresponding key for our dynamodb entries and our s3 bucket
ddbs3_key = DOW_KEYS[yesterday]
print(ddbs3_key)

THU


In [5]:
# connect to aws
try:
  aws_session = boto3.Session(
          aws_access_key_id = AWS_KEY,
          aws_secret_access_key = AWS_SECRET_KEY,
          region_name = REGION
      )
except:
  print('failed to begin AWS session')
  # return with error
  # this is the only error that we can't log to github, because we never got the credentials

In [6]:
# then connect to secrets manager
try:
  secrets_client = aws_session.client('secretsmanager')
  secret_value = secrets_client.get_secret_value(SecretId=SECRETS_ID)
  secret_string = secret_value['SecretString']
  secret_map = json.loads(secret_string)
except:
  print('failed to reach aws secrets manager')
  # return with error

In [7]:
# create constants from the values in the secrets manager
BSKY_USERNAME = secret_map['bsky_username']
BSKY_PASS = secret_map['bsky_password']
GITHUB_TOKEN = secret_map['github_token']
GITHUB_REPO = secret_map['github_user/repo']
HUGGING_TOKEN = secret_map['hugging_token']

In [8]:
# before the program starts let's set up the logging function so we can insert it at any point where our program could break
def logging_aggregator(logging_text):
  # LOGGING ALL THE CHANGES TO OUR LOGGING FILE IN GITHUB
  datetime_object = datetime.datetime.fromtimestamp(cur_timestamp.timestamp())
  date_only = str(datetime_object.date())
  commit_message = "Logging follow aggregation on " + date_only


  # Step 1: Get the file's current content and SHA
  url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{FILE_PATH}"
  headers = {"Authorization": f"token {GITHUB_TOKEN}"}
  response = requests.get(url, headers=headers)
  response_json = response.json()

  # Decode the content of the file
  file_sha = response_json["sha"]
  content = base64.b64decode(response_json["content"]).decode("utf-8")

  # Step 2: Modify the file content
  new_content = content + date_only + ': ' + logging_text + '\n'
  encoded_content = base64.b64encode(new_content.encode("utf-8")).decode("utf-8")

  # Step 3: Push the updated content
  data = {
      "message": commit_message,
      "content": encoded_content,
      "sha": file_sha,
      "branch": BRANCH,
  }
  update_response = requests.put(url, headers=headers, json=data)

  if update_response.status_code == 200:
      print("Logging file updated successfully! Here's what was added to the logs:")
      print(date_only + ": " + logging_text)
  else:
      print(f"Error: {update_response.json()}")

In [9]:
# initialize dynamodb and s3
try:
  dynamodb = aws_session.resource(DDB)
  table = dynamodb.Table(DDB_TABLE)
except:
  print('ERROR - failed to get dynamo db table')
  logging_aggregator('ERROR - failed to get dynamo db table')
  # return with error
try:
  s3 = aws_session.client(S3)
  buckets = s3.list_buckets()
  bucket = s3.list_objects_v2(Bucket=S3_BUCKET)
except:
  print('ERROR - failed to get s3 bucket')
  logging_aggregator('ERROR - failed to get s3 bucket')
  # return with error

In [10]:
# keep track of everything in a set so we don't have duplicates
follows_aggregation = set()

In [11]:
# there should NOT be anything in the current s3 object for this bucket. But just in case there is, like one week didn't properly get cleared out or something, we will add it to the beginning of the aggregation
try:
  s3.head_object(Bucket=S3_BUCKET, Key=ddbs3_key)
  # except on saturdays - there should be the stuff in friday in the bucket when we check on saturday
  if yesterday != 'Friday':
    print("WARNING - Object existed in s3 bucket. Aggregating to current results.")
    logging_aggregator("WARNING - Object existed in s3 bucket. Aggregating to current results.")
  response = s3.get_object(Bucket=S3_BUCKET, Key=ddbs3_key)
  # creates a list from the json info in the s3 bucket
  data = json.loads(response["Body"].read())
  # add all items from the list into our current set
  follows_aggregation.update(data)
except s3.exceptions.ClientError as e:
  if e.response["Error"]["Code"] == "404":
      print("Clear to proceed - object did not exist in s3 bucket")
except Exception as e:
  err = f'ERROR - failed to get s3 object'
  print(err)
  logging_aggregator(err)

Clear to proceed - object did not exist in s3 bucket


In [12]:
# now it's time to iterate through our the attributes on our dynamodb key
count_runs_combined = 0 # for logging purposes
try:
  ddb_response = table.get_item(
      Key={'DOW': ddbs3_key},
  )
except Exception as e:
  print(f"ERROR - failed to check key's existence: {e}")
  logging_aggregator(f"ERROR - failed to check key's existence: {e}")

print('ddb response:', ddb_response)
# this if else checks to see if there is anything
if 'Item' not in ddb_response:
  print('WARNING - found no items in this key, runs may have failed yesterday')
  logging_aggregator('WARNING - found no items in this key, runs may have failed yesterday')
else:
  count_runs_combined = len(ddb_response['Item'])
  for attribute in ddb_response['Item']: # this iterates through all the attributes in the key
    # for val in ddb_response['Item'][attribute]: # and this iterates through all of the values in the value of that key
    # instead of iterating through all the values we'll just add them all into the set directly
    # print('attr', ddb_response['Item'][attribute])
    follows_aggregation.update(ddb_response['Item'][attribute])

  # after finishing iterating through all of the attributes we can delete this key from the dynamodb to clear out all the previous runs
  try:
    table.delete_item(
      Key={'DOW': ddbs3_key}
    )
  except Exception as e:
    print(f"ERROR - failed to delete item {ddbs3_key} from dynamodb: {e}")
    logging_aggregator(f"ERROR - failed to delete item {ddbs3_key} from dynamodb: {e}")

# print(follows_aggregation)

ddb response: {'Item': {'2025-02-20 16:15:50.170188-05:00': {'did:plc:3p5yobuwhtr6m4oktlslrw63', 'did:plc:ykdxd6mepbbewcqjfxl5xmmy', 'did:plc:weu25d43k3b3xeuj2nhmgsr2', 'did:plc:yaemkgtk4cbzyohymf52kgfv', 'did:plc:rhc3xfci2tpbgsu7rv4ooiae', 'did:plc:mwerdufq5sdyllp6f4tfjfa2', 'did:plc:24l4az7vv77vnazhmbub55kc', 'did:plc:6kxixocf4bphpsqckg75ek2x', 'did:plc:dlauuuo3c7qtiucmgm4z7qty', 'did:plc:edt6ns5ccxpnmxmur2rom3wu', 'did:plc:jwm5p6spdmhcwnuntfham5e5', 'did:plc:7ehfiretoz6dnlhbboundt4t', 'did:plc:fx5hwerep44vrqhhxbx64l6v', 'did:plc:kdnxvnpopg4rnuw5gwma7ryx', 'did:plc:ifnh47bjqadbg3fl2lqn2cwp', 'did:plc:okh5ws47pn4cud4esgl4un5s', 'did:plc:ncl3lzuxc2j4p6d7kpibvbz4', 'did:plc:5sxce5v5cxjr2o6pvkmtjrjn', 'did:plc:ue3uhkutvmcfgnledwltw4wr', 'did:plc:x4keitvdrp3vjno2e44gg5qh', 'did:plc:k3hx2ab2xpxjyzlgw6yhjtbv', 'did:plc:ne74c3s34uutqbrjwvo4dfia', 'did:plc:xaefink6pzagwfgh4fbrtfcy', 'did:plc:xx6u5scnxoukbxue3m5uwv3h', 'did:plc:nkrtz7w2yjr4nzyfyy33s47l', 'did:plc:axxqc2d6niip62l5ddp6btfi', 'di

In [13]:
# now we've aggregated all the values, so we just need to put that into s3
aggregate_list = list(follows_aggregation)
try:
  s3.put_object(
      Bucket=S3_BUCKET,
      Key=ddbs3_key,
      Body=json.dumps(aggregate_list),
      ContentType="application/json"
  )
  logging_aggregator(f'Successfully aggregated follows from {ddbs3_key}. Today there were {count_runs_combined} runs, with a total of {len(aggregate_list)} follows.')
except Exception as e:
    print(f"ERROR - failed to upload object to s3: {e}")
    logging_aggregator(f"ERROR - failed to upload object to s3: {e}")

Logging file updated successfully! Here's what was added to the logs:
2025-02-21: Successfully aggregated follows from THU. Today there were 4 runs, with a total of 2892 follows.


In [14]:
# for debugging, do not leave this in final code
# print(table.scan())

In [15]:
print(table.scan())

{'Items': [{'2025-02-21 13:54:32.639807-05:00': {'did:plc:3mzaeg6bkagfdgoiovdxly5y', 'did:plc:zs33zyfrm2v6d5cvyvdsdfql', 'did:plc:6gjcwav6lcybfragxy5gpxyy', 'did:plc:kogmyskdf6yshexbmohigm2i', 'did:plc:nwxwjj2xo74yroonvp6xomas', 'did:plc:btqs6czupp4ytt7wjxsolwgd', 'did:plc:kispeez7b6kyvj4m7ixa6t5c', 'did:plc:wowz5ow7npzpotutiwutfujj', 'did:plc:gw2j5qzkqfan2rgpd626iwac', 'did:plc:dfi3tifaj6y6fuoih3imrgeo', 'did:plc:wpfjdjyru5fx7ikv27iveknm', 'did:plc:rkv4lwghgi7huex7ix2aideu', 'did:plc:zkm6vzwyarl3imepmcaane7n', 'did:plc:u3q6i2imqmhvvywi7xk6tfvp', 'did:plc:7kyyuvokwrllfvo5nkuw3vi4', 'did:plc:zuy7q6uus5lkjzo2df2pi6um', 'did:plc:b3b4grwh6etvqlsb4d54nxbu', 'did:plc:mkxk6u7l22usajh55jd7cdd2', 'did:plc:3v3hz4eu5byq5ynncybwpb4o', 'did:plc:x75cbermigpsugis5fczki6x', 'did:plc:sgzkzfxfhgrfxn4ofq653jmj', 'did:plc:yok73xfgs4wyyyfxtbdler5x', 'did:plc:h3wrque4giqikxzh7o2nppi6', 'did:plc:yv3ctseawhzelpuq76zviyg2', 'did:plc:zx36b6dgibj5t7m5orakfn5o', 'did:plc:6gdhxkb32mxowa5rmgrimomc', 'did:plc:qja4sx