In [1]:
from elasticsearch_dsl import connections, Index, Q
from json import loads
from pandas import DataFrame, NA
from pathlib import Path

from fediverse_analysis.instance_data.analyze import Analyzer

In [2]:
# --- Elasticsearch connection settings ---
# Host and port are kept separate; they are joined into one URL when the connection is created.
ELASTIC_HOST = 'https://elasticsearch.srv.webis.de'
ELASTIC_PORT = 9200
# NOTE(review): the user name is hard-coded; consider reading it from the environment before sharing this notebook.
ELASTIC_USER = 'wo84xel'
# To keep the password out of the notebook, point this at a file whose first line contains only the Elastic password.
ELASTIC_PASSWORD_FILE = Path('~/.local/share/passwords/webis-elasticsearch.txt').expanduser()
# Index pattern (trailing wildcard) that all searches in this notebook run against.
INDEX = 'corpus_mastodon_statuses*'

# --- Input files ---
# NOTE(review): absolute paths; the notebook only runs where this storage is mounted.
# JSONL file, one JSON object per line, read for its 'instance', 'nodeinfo' and 'activity' entries.
INSTANCE_DATA_PATH = Path('/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/fedi_data/fedi_data_7.jsonl')
# Text file with one instance name per line.
INSTANCES_PATH = Path('/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/sample/03/instances.txt')
# JSON file whose first line holds the instances removed for crawling errors.
REMOVED_INSTANCES_PATH = Path('/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/sample/03/instances_removed_for_crawling_errors.json')

# Number of instances (largest buckets first) that get their own row in the final table.
NUM_EXPLICIT_INSTANCES = 10
# Upper bound for the number of buckets in Elastic aggs. Set a bit higher than the number of instances crawled.
SEARCH_MAX = 1100

In [3]:
# Open the default Elastic connection used by every search below.
## The password is the first line of the password file, without surrounding whitespace.
with ELASTIC_PASSWORD_FILE.open('r') as password_file:
    password = password_file.readline().strip()
elastic = connections.create_connection(
    hosts=f'{ELASTIC_HOST}:{ELASTIC_PORT}',
    basic_auth=(ELASTIC_USER, password),
    timeout=300,
)

In [4]:
# Query Elastic for all crawled (federated) statuses.
## size=0: only the aggregations are needed, no hits.
fed_data_search = Index(INDEX).search().params(size=0)
## One bucket per crawled instance ...
fed_instances_agg = fed_data_search.aggs.bucket(
    'instances',
    'terms',
    field='crawled_from_instance.keyword',
    size=SEARCH_MAX,
)
## ... and, nested inside each bucket, the number of distinct account handles.
fed_instances_agg.bucket('users', 'cardinality', field='account.handle.keyword')
fed_data_result = fed_data_search.execute()

In [5]:
# Query Elastic for crawled statuses flagged as local (is_local == True).
is_local_filter = Q('term', is_local=True)
## size=0: only the aggregations are needed, no hits.
local_data_search = Index(INDEX).search().params(size=0).query('bool', filter=is_local_filter)
## One bucket per crawled instance ...
local_instances_agg = local_data_search.aggs.bucket(
    'instances',
    'terms',
    field='crawled_from_instance.keyword',
    size=SEARCH_MAX,
)
## ... and, nested inside each bucket, the number of distinct account handles.
local_instances_agg.bucket('users', 'cardinality', field='account.handle.keyword')
local_data_result = local_data_search.execute()

In [6]:
# Derive the row labels of the final table from the federated instance buckets.
buckets = fed_data_result.aggs['instances']['buckets']
## The first NUM_EXPLICIT_INSTANCES buckets are listed by name.
instances = [buckets[i]['key'] for i in range(NUM_EXPLICIT_INSTANCES)]
## Label for every remaining bucket, lumped together.
others_str = f'{len(buckets) - NUM_EXPLICIT_INSTANCES} others'
## Label for all buckets together.
crawled_str = f'{len(buckets)} crawled'
## Row order: explicit instances, then 'others', then 'crawled'.
index = [*instances, others_str, crawled_str]

In [7]:
# Evaluate data: fill the per-row post and user counts for the federated and the local search.
fed_posts = {}
fed_users = {}
local_posts = {}
local_users = {}
## Instances that get their own row; every other bucket is folded into 'others'.
top_instances = set(instances)
## Federated and local search results look the same, so we can do the same stuff twice.
for search_result, posts, users in (
    (fed_data_result, fed_posts, fed_users),
    (local_data_result, local_posts, local_users)
):
    ## list → dict
    data_dict = {
        entry['key']: {
            'doc_count': entry['doc_count'],
            'users': entry['users']['value']
        }
        for entry in search_result.aggs['instances']['buckets']
    }
    ## top 10
    for instance in instances:
        # `instances` is taken from the federated result, so an instance may have no bucket
        # in the local result; count that as zero instead of raising KeyError.
        entry = data_dict.pop(instance, {'doc_count': 0, 'users': 0})
        posts[instance] = entry['doc_count']
        users[instance] = entry['users']
    ## Others / all crawled (the top instances were popped above, so only 'others' remain).
    posts[others_str] = sum(v['doc_count'] for v in data_dict.values())
    posts[crawled_str] = sum(posts.values())
## Local users ~(top 10): only buckets that are NOT among the top instances may be summed here,
## otherwise the top instances end up in 'others' and are counted twice in the total below.
## NOTE(review): adding per-instance cardinalities assumes a local account shows up under
## exactly one instance – confirm.
local_users[others_str] = sum(
    bucket['users']['value']
    for bucket in local_data_result.aggs['instances']['buckets']
    if bucket['key'] not in top_instances
)
## Local users all crawled = top instances + others.
local_users[crawled_str] = sum(local_users.values())

In [8]:
# Federated users is not as easy – we need additional searches for these.
## Prepare Query for instances: ~(top 10)
## Start from match-all and exclude one top instance per iteration.
not_top10_query = Q()
for instance in instances:
    # Match on the `.keyword` sub-field, i.e. the same field the aggregations that produced
    # `instances` bucket on. A `term` query is an exact match and is not reliable on an
    # analyzed text field.
    not_top10_query = not_top10_query & ~Q('term', **{'crawled_from_instance.keyword': instance})

In [9]:
## Federated, ~(top 10): distinct account handles over everything matched by not_top10_query.
others_fed_users_search = (
    Index(INDEX)
    .search()
    .params(size=0)
    .query('bool', filter=not_top10_query)
)
others_fed_users_search.aggs.bucket(
    'users', 'cardinality', field='account.handle.keyword'
)
others_fed_users_result = others_fed_users_search.execute()

fed_users[others_str] = others_fed_users_result.aggs['users']['value']

In [10]:
## Federated, all crawled: distinct account handles over the whole index pattern, no filter.
all_fed_users_search = (
    Index(INDEX)
    .search()
    .params(size=0)
)
all_fed_users_search.aggs.bucket(
    'users', 'cardinality', field='account.handle.keyword'
)
all_fed_users_result = all_fed_users_search.execute()

fed_users[crawled_str] = all_fed_users_result.aggs['users']['value']

In [11]:
# 'Available' data: basically the entire Mastodon network (that we know of).
## Get all instances we ever crawled.
crawled_instances = set()
with INSTANCES_PATH.open('r') as f:
    for line in f:
        # Strip the line ending rather than cutting the last character: a final line without
        # a trailing newline would otherwise lose the last letter of its instance name.
        instance = line.strip()
        # Skip blank lines so that '' never ends up in the set.
        if instance:
            crawled_instances.add(instance)
with REMOVED_INSTANCES_PATH.open('r') as f:
    # Only the first line is parsed; presumably the file is a single-line JSON collection
    # of instance names – TODO confirm.
    crawled_instances.update(loads(f.readline()))
## File to dict: instance name → nodeinfo
mastodon_data = {}
with INSTANCE_DATA_PATH.open('r') as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing empty line) in the JSONL file.
        if not line.strip():
            continue
        line_dict = loads(line)
        nodeinfo = line_dict['nodeinfo']
        # No nodeinfo means server unreachable or other software.
        if not nodeinfo:
            continue
        # We need either (nodeinfo & activity) or (nodeinfo & software==mastodon)
        if not line_dict['activity']:
            software = nodeinfo['software']
            if not software or software['name'] != 'mastodon':
                continue
        # We only need nodeinfo.
        mastodon_data[line_dict['instance']] = nodeinfo

## Row label for everything discovered; appended after the labels already in `index`.
all_str = str(len(mastodon_data)) + ' discovered'
index.append(all_str)

In [12]:
# Local posts / users as reported by each instance's nodeinfo 'usage' section.
# NOTE: this cell removes the top instances from `mastodon_data` and `crawled_instances`,
# so it cannot be re-run without re-running the cell that builds those two first.
## 'others' starts at zero and is accumulated further down.
avail_local_posts = {others_str: 0}
avail_local_users = {others_str: 0}
## top 10
for instance in instances:
    usage = mastodon_data[instance]['usage']
    avail_local_posts[instance] = usage['localPosts']
    avail_local_users[instance] = usage['users']['total']
    # Take the instance out of both collections so the 'others' and 'all' sums skip it.
    del mastodon_data[instance]
    crawled_instances.remove(instance)
## We will need this later ('others' is still 0 here, so these are the top-10 totals).
top10_avail_local_posts = sum(avail_local_posts.values())
top10_avail_local_users = sum(avail_local_users.values())
## Others (crawled, but not top 10)
for instance in crawled_instances:
    usage = mastodon_data[instance]['usage']
    avail_local_posts[others_str] += usage['localPosts']
    avail_local_users[others_str] += usage['users']['total']
## All crawled = others + top 10
avail_local_posts[crawled_str] = sum(avail_local_posts.values())
avail_local_users[crawled_str] = sum(avail_local_users.values())
## All: the top 10 were deleted from mastodon_data above, so add their totals back in.
remaining_local_posts = sum(info['usage']['localPosts'] for info in mastodon_data.values())
remaining_local_users = sum(info['usage']['users']['total'] for info in mastodon_data.values())
avail_local_posts[all_str] = remaining_local_posts + top10_avail_local_posts
avail_local_users[all_str] = remaining_local_users + top10_avail_local_users

In [13]:
# Append NA, otherwise it will be NaN and the whole column will be converted to float.
# The loop variable is deliberately not named `dict`: notebook cells share one namespace,
# so that name would shadow the builtin `dict` for every cell executed afterwards.
for crawled_column in (fed_posts, fed_users, local_posts, local_users):
    crawled_column[all_str] = NA

# Two-level column header: (data source, measure). Rows follow the order of `index`.
df = DataFrame(
    {
        ('Crawled', 'Posts (fed.)'): fed_posts,
        ('Crawled', 'Users (fed.)'): fed_users,
        ('Crawled', 'Posts (loc.)'): local_posts,
        ('Crawled', 'Users (loc.)'): local_users,
        ('Available', 'Posts (loc.)'): avail_local_posts,
        ('Available', 'Users (loc.)'): avail_local_users
    },
    index=index
)
df.index.name = 'Instance'

In [14]:
# Show the assembled table as this cell's output.
df

Unnamed: 0_level_0,Crawled,Crawled,Crawled,Crawled,Available,Available
Unnamed: 0_level_1,Posts (fed.),Users (fed.),Posts (loc.),Users (loc.),Posts (loc.),Users (loc.)
Instance,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
mastodon.social,7871970.0,204821.0,1573339.0,54714.0,77415689,1745090
mastodon.online,5005399.0,144590.0,153566.0,4428.0,7374520,194721
mstdn.social,4926143.0,111935.0,188623.0,4715.0,14414925,219773
ohai.social,4469489.0,106323.0,10550.0,584.0,1120201,38895
mastodon.world,4152513.0,101754.0,93858.0,3064.0,4601165,180994
mas.to,4136513.0,100973.0,90925.0,3696.0,6683778,168048
universeodon.com,3796416.0,112253.0,47038.0,1338.0,2751006,77999
techhub.social,3610601.0,104471.0,38426.0,1168.0,1306092,79254
social.vivaldi.net,3608270.0,55728.0,80042.0,1337.0,1574476,41642
toot.community,3346790.0,100175.0,15303.0,638.0,1231496,30723
