In [1]:
import json
import os
from datetime import datetime
from datetime import timedelta
from random import sample

from bson import json_util
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.collection import Collection
from pymongo.database import Database

# Load environment variables via python-dotenv so that the os.getenv() lookups
# used for the MongoDB connection can find them.
load_dotenv()

True

## Sampling Strategy

| Period | # Samples/Day (in thousands) | Total No. of Samples (in thousands) |
| --------- | ------------- | ------- |
| February 18 - 24 | 30 | 210 |
| February 25 | 50 | 50 |
| February 26 - 28 | 15 | 45 |
| Total | - | 305 |

## Connection

In [2]:
# Build the MongoDB URI from environment variables and open a client.
# MONGODB_CONNECTION_STRING is used as a str.format template that receives the
# `password` and `database` keyword arguments.
db_user_password = os.getenv("MONGODB_USER_PASSWORD")
db_name = os.getenv("MONGODB_DB")
connection_template = os.getenv("MONGODB_CONNECTION_STRING")
if connection_template is None:
    # Without this guard the failure is an opaque TypeError raised by
    # str.format(None, ...), which does not mention the missing variable.
    raise RuntimeError(
        "MONGODB_CONNECTION_STRING is not set; define it in the environment or the .env file"
    )
# NOTE(review): the password is inserted verbatim; if it can contain
# URI-reserved characters it presumably needs percent-escaping — confirm.
connection_string = connection_template.format(
    password=db_user_password, database=db_name
)
connection = MongoClient(connection_string)

In [3]:
# Handles for the database and for the tweet collection that is sampled below.
# NOTE(review): the database name is hard-coded here rather than taken from
# `db_name` — confirm this is intended.
db: Database = connection["twitter-data"]
collection: Collection = db["2023Elections"]

## Sampling

In [4]:
# Accumulator for every sampled document; each sampling section appends to it.
all_data = list()

### February 18th - 24th: One week before elections

In [None]:
# Documents to draw per day for February 18 - 24.
N_SAMPLES = 30_000

In [None]:
# Draw N_SAMPLES random documents for each calendar day from Feb 18 through Feb 24.
for day_of_month in range(18, 25):
    day_start = datetime(2023, 2, day_of_month)
    # Half-open window [day_start, next day) on the `timestamp` field.
    day_filter = {"timestamp": {"$gte": day_start, "$lt": day_start + timedelta(days=1)}}
    pipeline = [{"$match": day_filter}, {"$sample": {"size": N_SAMPLES}}]
    all_data.extend(list(collection.aggregate(pipeline)))

### February 25: Election Day

In [5]:
# Election-day documents are gathered separately before being merged into all_data.
election_day_data = list()

# Target number of documents for February 25.
N_SAMPLES = 50_000

In [None]:
# NOTE: This is untested
# Sample in batches because of MongoDB Memory Error.
# Separate batches can return the same tweet, so duplicates are dropped as they
# arrive; any overshoot past N_SAMPLES is trimmed by the later subsampling step.
BATCH_SIZE = 25000
election_day_pipeline = [
    {"$match": {"timestamp": {
        "$gte": datetime(2023, 2, 25), "$lt": datetime(2023, 2, 26)}}},
    {"$sample": {"size": BATCH_SIZE}},
]

# Seed from whatever is already collected so that re-running this cell does not
# store the same tweet twice.
unique_ids = {item["tweet_id"] for item in election_day_data}

while len(unique_ids) < N_SAMPLES:
    ids_before = len(unique_ids)
    for item in collection.aggregate(election_day_pipeline):
        tweet_id = item["tweet_id"]
        if tweet_id not in unique_ids:
            unique_ids.add(tweet_id)
            election_day_data.append(item)
    if len(unique_ids) == ids_before:
        # A whole batch produced nothing new: the day most likely has fewer than
        # N_SAMPLES distinct tweets. Stop rather than loop forever.
        print(f"Stopping early with {len(unique_ids)} unique tweets (target {N_SAMPLES})")
        break

In [None]:
# Draw the subsample of tweet ids ONCE. Calling sample() inside the filter would
# redraw it for every item, so each item would be tested against a different
# random draw. random.sample() needs a sequence, hence the list() around the set;
# min() avoids a ValueError when fewer than N_SAMPLES ids are available.
remaining_ids = set(sample(list(unique_ids), k=min(N_SAMPLES, len(unique_ids))))

# Keep the first document seen for each selected id; removing the id once it is
# used drops any duplicate copies of the same tweet.
subsampled_data = []
for item in election_day_data:
    if item["tweet_id"] in remaining_ids:
        remaining_ids.discard(item["tweet_id"])
        subsampled_data.append(item)
election_day_data = subsampled_data

In [None]:
# Append the election-day documents to the combined sample.
all_data += election_day_data

### February 26 - 28: Three days after elections

In [None]:
# Documents to draw per day for February 26 - 28.
N_SAMPLES = 15_000

In [None]:
# Draw N_SAMPLES random documents for each calendar day from Feb 26 through Feb 28.
for day in range(26, 29):
    start_date = datetime(2023, 2, day)
    end_date = start_date + timedelta(days=1)       # Gracefully handle end-of-month

    # start_date / end_date are already datetime objects, so they go into the
    # filter directly; wrapping them in datetime(2023, 2, ...) raises TypeError.
    cursor = collection.aggregate([
        { '$match': { 'timestamp': {
            "$gte": start_date, "$lt": end_date}}},
        { '$sample': { 'size':  N_SAMPLES}}
    ])
    all_data.extend(list(cursor))

### Output

In [None]:
# Number of documents collected across all sampling sections (shown as the cell output).
len(all_data)

In [None]:
# JSON serializer for the sampled documents. json_util.default handles ObjectId,
# which plain json.dumps fails with; ensure_ascii=False keeps non-ASCII text as-is.
bson_encoder = json.JSONEncoder(default=json_util.default, ensure_ascii=False)
dumps = bson_encoder.encode

In [None]:
# Write the sample as JSON Lines: one serialized document per line.
# The encoding is set explicitly because `dumps` is built with ensure_ascii=False
# and therefore emits raw non-ASCII characters; relying on the platform default
# encoding can fail or produce a non-UTF-8 file.
with open("../data/2023Elections_sample_test.jsonl", "w", encoding="utf-8") as f:
    for d in all_data:
        f.write(dumps(d) + "\n")