In [29]:
from elasticsearch_dsl import connections, Index, Search
from elasticsearch_dsl.query import Range, Terms, Term, Exists
from elasticsearch_dsl.aggs import Terms as PerTerms, Cardinality, Min, Max, Missing, Percentiles
from json import loads, load
from pandas import DataFrame, NA
from pathlib import Path
from favicon import get as get_favicon
from requests import get
from tqdm.auto import tqdm
from datetime import datetime, UTC

In [2]:
# Elasticsearch endpoint and account used by the connection cell.
ELASTIC_HOST = "https://elasticsearch.srv.webis.de"
ELASTIC_PORT = 9200
ELASTIC_USER = "ajjxp"
# To keep the password out of the notebook, point this at a file that contains only the Elastic password.
ELASTIC_PASSWORD_FILE = Path("~/.local/share/passwords/webis-elasticsearch.txt").expanduser()
# Index pattern that all searches in this notebook run against.
INDEX = "corpus_mastodon_statuses*"

# Base directory for generated files (favicons are stored in its "favicons" sub-directory).
OUTPUT_PATH = Path("../data")

# Number of top instances (ranked by unique posts) that get their own row in the statistics.
NUM_EXPLICIT_INSTANCES = 10
# NOTE(review): not referenced in the visible part of the notebook; presumably used further down.
NUM_EXPLICIT_BUCKETS = 3

# Limit the Elastic searches to a specific date range. Crawling started on 2023-12-21.
DATE_AFTER = "2023-12-01T00:00:00"
## Ca. "2024-01-30T12:00:00" is the time when a new version of the fediverse data was gathered.
DATE_BEFORE = "2024-02-22T00:00:00"

In [3]:
# Connect to Elastic.
# The password is read from a separate file so that it never appears in the notebook.
with ELASTIC_PASSWORD_FILE.open("r") as f:
    # Only the first line is used; the trailing line break is not part of the password.
    password = f.readline().strip("\n")
if not password:
    # Fail here with a clear message instead of with an authentication error on the first query.
    raise ValueError(f"The first line of {ELASTIC_PASSWORD_FILE} is empty; expected the Elastic password.")
elastic = connections.create_connection(
    hosts=ELASTIC_HOST + ":" + str(ELASTIC_PORT),
    basic_auth=(ELASTIC_USER, password),
    # Generous limit (in seconds) because the aggregations below run over the whole corpus.
    # `request_timeout` replaces the `timeout` keyword, which the 8.x client only accepts
    # as a deprecated alias.
    request_timeout=300,
)

In [4]:
# Range filter on the crawl timestamp, shared by every search in this notebook.
crawl_window = {
    "gte": DATE_AFTER,
    "lte": DATE_BEFORE,
    "format": "date_hour_minute_second",
}
date_query = Range(crawled_at=crawl_window)
# Show the generated query body as the cell output.
date_query.to_dict()

{'range': {'crawled_at': {'gte': '2023-12-01T00:00:00',
   'lte': '2024-02-22T00:00:00',
   'format': 'date_hour_minute_second'}}}

In [25]:
# Starting point for all later queries: every index matching INDEX, restricted by the date filter.
base_search: Search = Index(INDEX).search().filter(date_query)
# Number of documents that fall into the date range.
total_count = base_search.count()
total_count

733422366

In [6]:
def save_favicon(instance: str, quiet: bool = False) -> Path | None:
    favicon_path = OUTPUT_PATH / "favicons" / f"favicon-{instance}.png"
    favicon_path.parent.mkdir(parents=True, exist_ok=True)
    if favicon_path.exists():
        return favicon_path
    domain = instance
    if instance.count(".") > 1:
        domain = ".".join(instance.split(".")[-2:])
    if not quiet:
        print(f"Searching for favicon for {domain}...")
    try:
        favicons = get_favicon(f"https://{domain}", timeout=2)
    except Exception:
        return None
    favicons = sorted(favicons, key=lambda f: f.width, reverse=True)
    favicons = [favicon for favicon in favicons if favicon.format == "png"]
    if len(favicons) == 0:
        return None
    if not quiet:
        print(f"Downloading favicon from {domain}...")
    favicon_response = get(favicons[0].url, stream=True, timeout=3)
    try:
        with favicon_path.open("wb") as image:
            for chunk in favicon_response.iter_content(1024):
                image.write(chunk)
    except Exception:
        favicon_path.unlink()
        return None
    return favicon_path

In [7]:
# Count the distinct values of `instance.keyword` with a cardinality aggregation.
source_instances_count_search: Search = base_search.params(size=0)
source_instances_count_search.aggs.metric(
    "instances",
    Cardinality(field="instance.keyword"),
)
source_instances_response = source_instances_count_search.execute()
source_instances_count = source_instances_response.aggs.instances.value
source_instances_count

16655

In [37]:
# Count the distinct values of `uri.keyword` (unique posts) with a cardinality aggregation.
source_uris_count_search: Search = base_search.params(size=0)
source_uris_count_search.aggs.metric(
    "uris",
    Cardinality(field="uri.keyword"),
)
source_uris_response = source_uris_count_search.execute()
source_uris_count = source_uris_response.aggs.uris.value
source_uris_count

35300568

In [43]:
# One terms bucket per source instance, each with a cardinality metric on the post URI.
source_unique_post_counts_search: Search = base_search.params(size=0)
source_unique_post_counts_search.aggs.bucket(
    "instances",
    # Ask for as many buckets as the instance count determined above.
    PerTerms(field="instance.keyword", size=source_instances_count),
).metric(
    "uris",
    Cardinality(field="uri.keyword"),
)

source_unique_post_counts_response = source_unique_post_counts_search.execute()
source_unique_post_counts_buckets = source_unique_post_counts_response.aggs.instances.buckets

# Documents per instance (a post is counted once per stored copy).
source_post_counts = {}
# Distinct post URIs per instance.
source_unique_post_counts = {}
for bucket in source_unique_post_counts_buckets:
    source_post_counts[bucket.key] = bucket.doc_count
    source_unique_post_counts[bucket.key] = bucket.uris.value
source_unique_post_counts

{'mastodon.social': 4274826,
 'mstdn.social': 744583,
 'press.coop': 217370,
 'mastodon.online': 474488,
 'mastodon.world': 319467,
 'fedibird.com': 605355,
 'universeodon.com': 179680,
 'submarin.online': 141630,
 'infosec.exchange': 193183,
 'sportsbots.xyz': 667792,
 'mas.to': 337294,
 'misskey.io': 1797527,
 'live-theater.net': 699158,
 'chaos.social': 254880,
 'rss-mstdn.studiofreesia.com': 205544,
 'botsin.space': 223952,
 'masto.globaleas.org': 309973,
 'misskey.gg': 65127,
 'masto.ai': 131717,
 'mastodonapp.uk': 133624,
 'hachyderm.io': 168019,
 'troet.cafe': 258219,
 'tech.lgbt': 218394,
 'mastodon.art': 131288,
 'mstdn.jp': 839146,
 'pubeurope.com': 67264,
 'vivaldi.net': 165435,
 'mstdn.ca': 156046,
 'mamot.fr': 101052,
 'sfba.social': 107726,
 'mastodon.nl': 154164,
 'channels.im': 156308,
 'fosstodon.org': 122733,
 'kolektiva.social': 128107,
 'beige.party': 117760,
 'pravda.me': 107177,
 'newsie.social': 77888,
 'flipboard.com': 110502,
 'gratefuldread.masto.host': 30822,

In [44]:
# Instance names ordered by descending unique-post count, cut off after NUM_EXPLICIT_INSTANCES.
instances_by_unique_posts = sorted(
    source_unique_post_counts,
    key=source_unique_post_counts.get,
    reverse=True,
)
top_source_instances: list[str] = instances_by_unique_posts[:NUM_EXPLICIT_INSTANCES]
top_source_instances

['mastodon.social',
 'misskey.io',
 'mstdn.jp',
 'mstdn.social',
 'live-theater.net',
 'sportsbots.xyz',
 'rss-parrot.net',
 'fedibird.com',
 'mastodon.online',
 'misskey-square.net']

In [45]:
# Quick check: instance name, unique posts, total documents.
for instance in top_source_instances:
    unique_posts = source_unique_post_counts[instance]
    total_posts = source_post_counts[instance]
    print(instance, unique_posts, total_posts)

mastodon.social 4274826 117824464
misskey.io 1797527 8312146
mstdn.jp 839146 4142184
mstdn.social 744583 23639564
live-theater.net 699158 8218391
sportsbots.xyz 667792 8839332
rss-parrot.net 628638 701103
fedibird.com 605355 11063632
mastodon.online 474488 19115506
misskey-square.net 427559 2237804


In [46]:
def _epoch_millis_to_utc(millis: float | None) -> datetime | None:
    """Convert an epoch-milliseconds aggregation value to an aware UTC datetime.

    ``None`` (no value returned by the aggregation) is passed through unchanged.
    """
    if millis is None:
        return None
    return datetime.fromtimestamp(millis / 1000, tz=UTC)


def _count_unique_uris(search: Search) -> int:
    """Execute ``search`` with a cardinality aggregation on ``uri.keyword`` and return its value.

    The aggregation is attached to a copy, so ``search`` itself is not modified.
    """
    counting_search: Search = search.params(size=0)
    counting_search.aggs.metric("uris", Cardinality(field="uri.keyword"))
    return counting_search.execute().aggs.uris.value


# One entry per counted post property: (stats key, progress message, filter selecting the posts).
# The order defines the column order of the resulting data frame.
UNIQUE_COUNT_FILTERS = [
    ("count_sensitive_unique", "Loading sensitive count.",
     Term(sensitive=True)),
    ("count_media_unique", "Loading media count.",
     Exists(field="media_attachments.id.keyword")),
    ("count_media_description_unique", "Loading media description count.",
     Exists(field="media_attachments.description.keyword")),
    ("count_account_bot_unique", "Loading account bot count.",
     Term(account__bot=True)),
    ("count_account_group_unique", "Loading account group count.",
     Term(account__group=True)),
    ("count_account_locked_unique", "Loading account locked count.",
     Term(account__locked=True)),
    # Negated term: posts whose account is not flagged as discoverable.
    ("count_account_non_discoverable_unique", "Loading account non-discoverable count.",
     ~Term(account__discoverable=True)),
    ("count_account_non_indexable_unique", "Loading account non-indexable count.",
     Term(account__noindex=True)),
    ("count_account_note_unique", "Loading account note count.",
     Exists(field="account.note")),
]

# Statistics per top instance; the key `None` holds the statistics of the whole corpus.
source_post_stats = {}
for instance in tqdm([*top_source_instances, None]):
    # Search limited to the current instance (or to nothing but the date range for `None`).
    instance_search: Search = base_search.params(size=0)
    if instance is not None:
        instance_search = instance_search.filter(Term(instance__keyword=instance))

    stats = {}

    print("Loading stats.")
    # The timestamp aggregations go onto a copy: the count queries below are derived from
    # `instance_search` and therefore do not carry (and recompute) these aggregations.
    source_post_stats_search: Search = instance_search.params(size=0)
    source_post_stats_search.aggs\
        .metric("min_timestamp", Min(field="created_at"))\
        .metric("max_timestamp", Max(field="created_at"))\
        .metric("timestamp_percentiles", Percentiles(field="created_at"))
    source_post_aggs = source_post_stats_search.execute().aggs

    # Totals were already computed above; reuse them instead of querying again.
    if instance is not None:
        stats["count_total"] = source_post_counts[instance]
        stats["count_unique"] = source_unique_post_counts[instance]
    else:
        stats["count_total"] = total_count
        stats["count_unique"] = source_uris_count

    stats["min_timestamp"] = _epoch_millis_to_utc(source_post_aggs.min_timestamp.value)
    stats["max_timestamp"] = _epoch_millis_to_utc(source_post_aggs.max_timestamp.value)
    for k, v in source_post_aggs.timestamp_percentiles.values.to_dict().items():
        # Skip the formatted duplicates of each percentile; only the numeric values are converted.
        if k.endswith("_as_string"):
            continue
        stats[f"timestamp_percentile_{k}"] = _epoch_millis_to_utc(v)

    for stats_key, progress_message, post_filter in UNIQUE_COUNT_FILTERS:
        print(progress_message)
        stats[stats_key] = _count_unique_uris(instance_search.filter(post_filter))

    source_post_stats[instance] = stats

# One row per instance plus the corpus-wide row (instance is missing there).
df = DataFrame([
    {"instance": instance_name, **instance_stats}
    for instance_name, instance_stats in source_post_stats.items()
])
df

  0%|          | 0/11 [00:00<?, ?it/s]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


  9%|▉         | 1/11 [00:56<09:26, 56.61s/it]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


 18%|█▊        | 2/11 [01:02<03:59, 26.63s/it]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


 27%|██▋       | 3/11 [01:07<02:13, 16.67s/it]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


 36%|███▋      | 4/11 [01:16<01:36, 13.79s/it]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


 45%|████▌     | 5/11 [01:22<01:06, 11.07s/it]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


 55%|█████▍    | 6/11 [01:29<00:47,  9.47s/it]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


 64%|██████▎   | 7/11 [01:31<00:28,  7.14s/it]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


 73%|███████▎  | 8/11 [01:37<00:20,  6.87s/it]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


 82%|████████▏ | 9/11 [01:51<00:17,  8.89s/it]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


 91%|█████████ | 10/11 [01:56<00:07,  7.78s/it]

Loading stats.
Loading sensitive count.
Loading media count.
Loading media description count.
Loading account bot count.
Loading account group count.
Loading account locked count.
Loading account non-discoverable count.
Loading account non-indexable count.
Loading account note count.


100%|██████████| 11/11 [05:18<00:00, 28.98s/it]


Unnamed: 0,instance,count_total,count_unique,min_timestamp,max_timestamp,timestamp_percentile_1.0,timestamp_percentile_5.0,timestamp_percentile_25.0,timestamp_percentile_50.0,timestamp_percentile_75.0,...,timestamp_percentile_99.0,count_sensitive_unique,count_media_unique,count_media_description_unique,count_account_bot_unique,count_account_group_unique,count_account_locked_unique,count_account_non_discoverable_unique,count_account_non_indexable_unique,count_account_note_unique
0,mastodon.social,117824464,4274826,2023-05-04 16:03:11+00:00,2024-02-21 23:59:58+00:00,2023-12-22 01:42:27.186092+00:00,2023-12-24 21:05:15.283617+00:00,2024-01-08 09:59:00.490641+00:00,2024-01-23 10:32:38.619847+00:00,2024-02-07 06:25:02.490638+00:00,...,2024-02-21 12:45:05.006008+00:00,158235,1097586,248948,835724,0,239597,1859787,85061,3656854
1,misskey.io,8312146,1797527,2023-11-22 16:20:15.241000+00:00,2024-02-21 23:59:54.003000+00:00,2023-12-22 03:43:38.537525+00:00,2023-12-24 15:26:59.618917+00:00,2024-01-07 02:58:59.572718+00:00,2024-01-21 12:42:11.547307+00:00,2024-02-06 18:32:38.861381+00:00,...,2024-02-21 08:43:37.694507+00:00,91124,227982,13673,71462,0,87887,95276,0,1786237
2,mstdn.jp,4142184,839146,2023-12-21 09:29:41+00:00,2024-02-21 23:59:58+00:00,2023-12-22 05:50:51.443801+00:00,2023-12-24 13:27:54.463673+00:00,2024-01-05 05:54:37.338106+00:00,2024-01-20 00:32:22.864277+00:00,2024-02-05 14:45:41.147177+00:00,...,2024-02-21 04:53:11.520227+00:00,40173,86079,2009,61762,0,44816,585827,0,779464
3,mstdn.social,23639564,744583,2023-10-17 11:21:02+00:00,2024-02-21 23:59:58+00:00,2023-12-21 23:43:59.333162+00:00,2023-12-24 17:30:29.385544+00:00,2024-01-06 22:04:35.643539+00:00,2024-01-22 19:34:29.143721+00:00,2024-02-06 21:50:55.923920+00:00,...,2024-02-21 12:33:48.357197+00:00,12331,83445,22313,138661,0,47854,272268,10081,712963
4,live-theater.net,8218391,699158,2023-12-21 12:55:52.769000+00:00,2024-02-21 23:59:43.950000+00:00,2023-12-22 07:42:30.817233+00:00,2023-12-24 11:11:45.202994+00:00,2024-01-04 05:15:01.549835+00:00,2024-01-20 06:54:04.076082+00:00,2024-02-05 10:31:33.369020+00:00,...,2024-02-21 03:52:38.040694+00:00,17034,39776,338,69,0,109461,28342,0,662370
5,sportsbots.xyz,8839332,667792,2014-08-24 20:05:23+00:00,2024-02-21 23:59:16+00:00,2023-12-22 01:40:07.407706+00:00,2023-12-25 12:50:43.222023+00:00,2024-01-07 19:31:55.512443+00:00,2024-01-22 08:19:25.050824+00:00,2024-02-07 06:27:51.216001+00:00,...,2024-02-21 13:14:16.722646+00:00,0,387504,69249,667792,0,0,0,0,667792
6,rss-parrot.net,701103,628638,2024-01-03 03:35:08+00:00,2024-02-21 23:57:42+00:00,2024-01-06 01:30:25.205261+00:00,2024-01-09 11:11:46.451557+00:00,2024-01-20 05:19:04.969653+00:00,2024-02-01 06:44:47.737070+00:00,2024-02-12 07:53:20.040515+00:00,...,2024-02-21 16:30:39.842111+00:00,0,0,0,628638,0,0,628638,0,628638
7,fedibird.com,11063632,605355,2023-12-15 04:33:42+00:00,2024-02-21 23:59:56+00:00,2023-12-22 06:32:45.762003+00:00,2023-12-24 13:44:47.954729+00:00,2024-01-05 22:56:37.030483+00:00,2024-01-21 06:48:12.649097+00:00,2024-02-05 16:45:39.110856+00:00,...,2024-02-21 07:30:01.956241+00:00,13836,56489,5368,26813,0,110678,392062,75016,605355
8,mastodon.online,19115506,474488,2023-10-17 17:38:09+00:00,2024-02-21 23:59:55+00:00,2023-12-22 00:21:31.430768+00:00,2023-12-24 20:59:46.474795+00:00,2024-01-07 01:26:17.769086+00:00,2024-01-22 16:42:37.619190+00:00,2024-02-06 18:58:48.236081+00:00,...,2024-02-21 14:11:52.880272+00:00,38412,78046,21501,122444,0,22950,125997,9775,424191
9,misskey-square.net,2237804,427559,2023-12-21 09:41:14.649000+00:00,2024-02-21 23:59:34.139000+00:00,2023-12-22 06:38:00.456262+00:00,2023-12-23 11:09:57.672612+00:00,2023-12-31 11:54:09.867732+00:00,2024-01-11 05:35:43.409555+00:00,2024-01-27 06:14:57.154998+00:00,...,2024-02-21 07:23:35.495503+00:00,20638,36323,1006,1917,0,172876,13778,0,419390


In [61]:
# Collect the distinct values of `crawled_from_instance.keyword` via a terms aggregation.
crawled_instances_search: Search = base_search.params(size=0)
crawled_instances_search.aggs.bucket(
    "instances",
    PerTerms(field="crawled_from_instance.keyword", size=source_instances_count),
)
crawled_instances_buckets = crawled_instances_search.execute().aggs.instances.buckets
crawled_instances = set()
for bucket in crawled_instances_buckets:
    crawled_instances.add(bucket.key)
len(crawled_instances)

1015

In [87]:
def _latex_percent(ratio: float) -> str:
    """Format ``ratio`` as a whole-number percentage with the percent sign escaped for LaTeX."""
    return f"{ratio:,.0%}".replace("%", r"\%")


def _stat_cells(stats_row) -> list[str]:
    """Build the twelve statistic cells (Days to Note) of one table row.

    ``stats_row`` is one row of ``df``; the same cells are used for the
    per-instance rows and for the corpus-wide summary row.
    """
    unique = stats_row["count_unique"]
    return [
        # Days between the 1st and the 99th percentile of the post timestamps.
        f"{(stats_row['timestamp_percentile_99.0'] - stats_row['timestamp_percentile_1.0']).days:d}d",
        f"{unique:,d}",
        # Redundancy: stored documents per unique post.
        f"{(stats_row['count_total'] / unique):,.0f}",
        _latex_percent(stats_row["count_sensitive_unique"] / unique),
        _latex_percent(stats_row["count_media_unique"] / unique),
        _latex_percent(stats_row["count_media_description_unique"] / unique),
        _latex_percent(stats_row["count_account_bot_unique"] / unique),
        _latex_percent(stats_row["count_account_group_unique"] / unique),
        _latex_percent(stats_row["count_account_locked_unique"] / unique),
        # The data holds the negated counts; the table shows the positive share.
        _latex_percent(1 - stats_row["count_account_non_discoverable_unique"] / unique),
        _latex_percent(1 - stats_row["count_account_non_indexable_unique"] / unique),
        _latex_percent(stats_row["count_account_note_unique"] / unique),
    ]


def _print_table_row(cells: list[str]) -> None:
    """Print ``cells`` as one indented LaTeX table row."""
    print(r"  " + " & ".join(cells) + r" \\")


print(r"\begin{tabular*}{\linewidth}{@{}cl@{\extracolsep{\fill}}ccrrrrrrrrrrr@{}}")
print(r"  \toprule")
cols = [
    r"\multicolumn{2}{@{}l}{\textbf{Instance}}",
    r"\textbf{Crw.}",
    r"\textbf{Days}",
    r"\textbf{Unique}",
    r"\textbf{Red.}",
    r"\textbf{Sens.}",
    r"\textbf{Media}",
    r"\textbf{Alt}",
    r"\textbf{Bot}",
    r"\textbf{Group}",
    r"\textbf{Lock.}",
    r"\textbf{Discov.}",
    r"\textbf{Index.}",
    r"\textbf{Note}",
]
_print_table_row(cols)
print(r"  \midrule")
# Rows with an instance name; the corpus-wide row (missing instance) is printed separately below.
for _, row in df[df["instance"].notna()].iterrows():
    favicon_path = save_favicon(row["instance"], quiet=True)
    cols = [
        # Fall back to the mastodon.social icon when no favicon could be stored.
        r"\favicon{" + (row['instance'] if favicon_path is not None else "mastodon.social") + r"}",
        r"\instance{" + row['instance'] + r"}",
        r"\yes" if row["instance"] in crawled_instances else r"\no",
        *_stat_cells(row),
    ]
    _print_table_row(cols)
print(r"  \midrule")
# Summary row over the whole corpus.
row_crawled = df[df["instance"].isna()].iloc[0]
cols = [
    r"\iconCrawled",
    f"{source_instances_count:,d}" + r"~sources",
    # Share of source instances that were also crawled from.
    _latex_percent(len(crawled_instances) / source_instances_count),
    *_stat_cells(row_crawled),
]
_print_table_row(cols)
print(r"  \bottomrule")
print(r"\end{tabular*}")

\begin{tabular*}{\linewidth}{@{}cl@{\extracolsep{\fill}}ccrrrrrrrrrrr@{}}
  \toprule
  \multicolumn{2}{@{}l}{\textbf{Instance}} & \textbf{Crw.} & \textbf{Days} & \textbf{Unique} & \textbf{Red.} & \textbf{Sens.} & \textbf{Media} & \textbf{Alt} & \textbf{Bot} & \textbf{Group} & \textbf{Lock.} & \textbf{Discov.} & \textbf{Index.} & \textbf{Note} \\
  \midrule
  \favicon{mastodon.social} & \instance{mastodon.social} & \yes & 61d & 4,274,826 & 28 & 4\% & 26\% & 6\% & 20\% & 0\% & 6\% & 56\% & 98\% & 86\% \\
  \favicon{misskey.io} & \instance{misskey.io} & \no & 61d & 1,797,527 & 5 & 5\% & 13\% & 1\% & 4\% & 0\% & 5\% & 95\% & 100\% & 99\% \\
  \favicon{mstdn.jp} & \instance{mstdn.jp} & \no & 60d & 839,146 & 5 & 5\% & 10\% & 0\% & 7\% & 0\% & 5\% & 30\% & 100\% & 93\% \\
  \favicon{mstdn.social} & \instance{mstdn.social} & \yes & 61d & 744,583 & 32 & 2\% & 11\% & 3\% & 19\% & 0\% & 6\% & 63\% & 99\% & 96\% \\
  \favicon{live-theater.net} & \instance{live-theater.net} & \no & 60d & 699,158 & 