In [1]:
from elasticsearch_dsl import connections, Index, Search
from elasticsearch_dsl.query import Range, Terms, Term
from elasticsearch_dsl.aggs import Terms as PerTerms, Cardinality
from json import loads, load
from pandas import DataFrame, NA
from pathlib import Path
from favicon import get as get_favicon
from requests import get

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  from pandas import DataFrame, NA


In [2]:
ELASTIC_HOST = "https://elasticsearch.srv.webis.de"
ELASTIC_PORT = 9200
ELASTIC_USER = "ajjxp"
# As a way to hide the password at least from the notebook, enter a path to a file here, which only contains the password for Elastic.
ELASTIC_PASSWORD_FILE = Path("~/.local/share/passwords/webis-elasticsearch.txt").expanduser()
INDEX = "corpus_mastodon_statuses*"

INSTANCE_DATA_PATH = Path("/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/fedi_data/2024-01-30/05.jsonl")
INSTANCES_PATH = Path("/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/sample/04/instances.txt")
REMOVED_INSTANCES_PATH = Path("/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/sample/04/instances_removed_for_crawling_errors.json")

OUTPUT_PATH = Path("../data")

NUM_TOP_INSTANCES = 10

# Limit the Elastic searches to a specific date range. Crawling started on 2023-12-21.
DATE_AFTER = "2023-12-01T00:00:00"
## Ca. "2024-01-30T12:00:00" is the time when a new version of the fediverse data was gahtered.
DATE_BEFORE = "2024-02-22T00:00:00"

In [3]:
# Connect to Elastic.
with ELASTIC_PASSWORD_FILE.open("r") as f:
    password = f.readline().strip("\n")
elastic = connections.create_connection(
    hosts=ELASTIC_HOST + ":" + str(ELASTIC_PORT),
    basic_auth=(ELASTIC_USER, password),
    timeout=300
)

In [4]:
# Prepare the date query
date_query = Range(crawled_at={"gte": DATE_AFTER, "lte": DATE_BEFORE, "format" : "date_hour_minute_second"})
date_query.to_dict()

{'range': {'crawled_at': {'gte': '2023-12-01T00:00:00',
   'lte': '2024-02-22T00:00:00',
   'format': 'date_hour_minute_second'}}}

In [5]:
base_search: Search = Index(INDEX)\
    .search()\
    .filter(date_query)
base_search.count()

733422534

In [6]:
crawled_instances_count_search: Search = base_search.params(size=0)
crawled_instances_count_search.aggs\
    .metric(
        "instances", 
        Cardinality(field="crawled_from_instance.keyword")
    )
crawled_instances_count = crawled_instances_count_search.execute()\
    .aggs.instances.value
crawled_instances_count

1015

In [10]:
crawled_counts_search: Search = base_search.params(size=0)
crawled_counts_search.aggs\
    .bucket(
        "instances",
        PerTerms(field="crawled_from_instance.keyword", size=NUM_TOP_INSTANCES)
    )\
    .metric(
        "uri", 
        Cardinality(field="uri.keyword")
    )
    
crawled_counts_buckets = crawled_counts_search.execute().aggs.instances.buckets
crawled_counts = {
    bucket.key: bucket.uri.value
    for bucket in crawled_counts_buckets
}
crawled_counts

{'mastodon.social': 15339031,
 'mastodon.online': 9903317,
 'mstdn.social': 9544636,
 'ohai.social': 8706348,
 'mas.to': 8200339,
 'mastodon.world': 8111819,
 'universeodon.com': 7453409,
 'social.vivaldi.net': 7165129,
 'techhub.social': 7112351,
 'toot.community': 6599259}

In [15]:
crawled_counts_search: Search = base_search.params(size=0)
crawled_counts_search.aggs\
    .metric(
        "uri", 
        Cardinality(field="uri.keyword")
    )
    
total_crawled_count = crawled_counts_search.execute().aggs.uri.value
total_crawled_count

35300568

In [55]:
crawled_federated_counts_search: Search = base_search.params(size=0)\
    .filter(Term(is_local=False))\
    .filter(Terms(crawled_from_instance=list(crawled_counts.keys())))
crawled_federated_counts_search.aggs\
    .bucket(
        "instances",
        PerTerms(field="crawled_from_instance.keyword", size=NUM_TOP_INSTANCES)
    )\
    .metric(
        "uri", 
        Cardinality(field="uri.keyword")
    )
    
crawled_federated_counts_buckets = crawled_federated_counts_search.execute().aggs.instances.buckets
crawled_federated_counts = {
    bucket.key: bucket.uri.value
    for bucket in crawled_federated_counts_buckets
}
crawled_federated_counts

{'mastodon.social': 12419766,
 'mastodon.online': 9637941,
 'mstdn.social': 9161146,
 'ohai.social': 8691028,
 'mas.to': 8022495,
 'mastodon.world': 7922482,
 'universeodon.com': 7358211,
 'social.vivaldi.net': 7033279,
 'techhub.social': 7042597,
 'toot.community': 6575255}

In [63]:
crawled_federated_counts_search: Search = base_search.params(size=0)\
    .filter(Term(is_local=False))
crawled_federated_counts_search.aggs\
    .metric(
        "uri", 
        Cardinality(field="uri.keyword")
    )
    
total_crawled_federated_count = crawled_federated_counts_search.execute().aggs.uri.value
total_crawled_federated_count

34780419

In [57]:
crawled_local_counts_search: Search = base_search.params(size=0)\
    .filter(Term(is_local=True))\
    .filter(Terms(crawled_from_instance=list(crawled_counts.keys())))
crawled_local_counts_search.aggs\
    .bucket(
        "instances",
        PerTerms(field="crawled_from_instance.keyword", size=NUM_TOP_INSTANCES)
    )\
    .metric(
        "uri", 
        Cardinality(field="uri.keyword", precision_threshold=10)
    )
    
crawled_local_counts_buckets = crawled_local_counts_search.execute().aggs.instances.buckets
crawled_local_counts = {
    bucket.key: bucket.uri.value
    for bucket in crawled_local_counts_buckets
}
crawled_local_counts

{'mastodon.social': 3213603,
 'mstdn.social': 361335,
 'mastodon.online': 254626,
 'mastodon.world': 208671,
 'mas.to': 182123,
 'social.vivaldi.net': 133506,
 'universeodon.com': 98605,
 'techhub.social': 71772,
 'toot.community': 29977,
 'ohai.social': 19110}

In [64]:
crawled_local_counts_search: Search = base_search.params(size=0)\
    .filter(Term(is_local=True))
crawled_local_counts_search.aggs\
    .metric(
        "uri", 
        Cardinality(field="uri.keyword")
    )
    
total_crawled_local_count = crawled_local_counts_search.execute().aggs.uri.value
total_crawled_local_count

9800888

In [65]:
def save_favicon(instance: str, quiet: bool = False) -> Path | None:
    favicon_path = OUTPUT_PATH / "favicons" / f"favicon-{instance}.png"
    favicon_path.parent.mkdir(parents=True, exist_ok=True)
    if favicon_path.exists():
        return favicon_path
    domain = instance
    if instance.count(".") > 1:
        domain = ".".join(instance.split(".")[-2:])
    if not quiet:
        print(f"Searching for favicon for {domain}...")
    try:
        favicons = get_favicon(f"https://{domain}", timeout=2)
    except Exception:
        return None
    favicons = sorted(favicons, key=lambda f: f.width, reverse=True)
    favicons = [favicon for favicon in favicons if favicon.format == "png"]
    if len(favicons) == 0:
        return None
    if not quiet:
        print(f"Downloading favicon from {domain}...")
    favicon_response = get(favicons[0].url, stream=True, timeout=3)
    try:
        with favicon_path.open("wb") as image:
            for chunk in favicon_response.iter_content(1024):
                image.write(chunk)
    except Exception:
        favicon_path.unlink()
        return None
    return favicon_path

In [66]:
top_crawled_instances: list[str] = [
    instance
    for instance, _ in sorted(
        crawled_counts.items(), 
        key=lambda b: b[1],
        reverse=True,
    )[:NUM_TOP_INSTANCES]
]
top_crawled_instances

['mastodon.social',
 'mastodon.online',
 'mstdn.social',
 'ohai.social',
 'mas.to',
 'mastodon.world',
 'universeodon.com',
 'social.vivaldi.net',
 'techhub.social',
 'toot.community']

In [67]:
crawled_data = []
for instance in crawled_counts.keys():
    if instance in top_crawled_instances:
        favicon_path = save_favicon(instance)
    else:
        favicon_path = None
    crawled_data.append({
        "instance": instance,
        "icon": favicon_path if favicon_path is not None else None,
        "posts_crawled": crawled_counts.get(instance, 0),
        "posts_crawled_federated": crawled_federated_counts.get(instance, 0),
        "posts_crawled_local": crawled_local_counts.get(instance, 0),
    })

df_crawled = DataFrame(crawled_data)
df_crawled["percentage_crawled_federated_of_all_crawled_federated"] = \
    df_crawled["posts_crawled_federated"] / total_crawled_federated_count
df_crawled["percentage_crawled_local_of_all_crawled_local"] = \
    df_crawled["posts_crawled_local"] / total_crawled_local_count
df_crawled["percentage_crawled_of_all_crawled"] = \
    df_crawled["posts_crawled"] / total_crawled_count
df_crawled.head(n=10).style\
    .format({
        col: "{:,.2%}"
        for col in df_crawled.columns
        if col.startswith("percentage_")
    })

Unnamed: 0,instance,icon,posts_crawled,posts_crawled_federated,posts_crawled_local,percentage_crawled_federated_of_all_crawled_federated,percentage_crawled_local_of_all_crawled_local,percentage_crawled_of_all_crawled
0,mastodon.social,../data/favicons/favicon-mastodon.social.png,15339031,12419766,3213603,35.71%,32.79%,43.45%
1,mastodon.online,../data/favicons/favicon-mastodon.online.png,9903317,9637941,254626,27.71%,2.60%,28.05%
2,mstdn.social,../data/favicons/favicon-mstdn.social.png,9544636,9161146,361335,26.34%,3.69%,27.04%
3,ohai.social,../data/favicons/favicon-ohai.social.png,8706348,8691028,19110,24.99%,0.19%,24.66%
4,mas.to,../data/favicons/favicon-mas.to.png,8200339,8022495,182123,23.07%,1.86%,23.23%
5,mastodon.world,../data/favicons/favicon-mastodon.world.png,8111819,7922482,208671,22.78%,2.13%,22.98%
6,universeodon.com,../data/favicons/favicon-universeodon.com.png,7453409,7358211,98605,21.16%,1.01%,21.11%
7,social.vivaldi.net,../data/favicons/favicon-social.vivaldi.net.png,7165129,7033279,133506,20.22%,1.36%,20.30%
8,techhub.social,../data/favicons/favicon-techhub.social.png,7112351,7042597,71772,20.25%,0.73%,20.15%
9,toot.community,../data/favicons/favicon-toot.community.png,6599259,6575255,29977,18.91%,0.31%,18.69%


In [68]:
print(r"\begin{tabular}{@{}cl@{\hspace{2em}}rr@{\hspace{1.5em}}rr@{\hspace{1.5em}}rr@{}}")
print(r"  \toprule")
cols = [
    r"\multicolumn{2}{@{}l}{\textbf{Instance}}",
    r"\multicolumn{6}{c}{\textbf{\iconCrawled~Crawled posts}}",
]
print(r"  " + " & ".join(cols) + r" \\")
print(r"  \cmidrule{3-8}")
cols = [
    r"",
    r"",
    r"\multicolumn{1}{c}{\iconFederated~Fed.}",
    r"\multicolumn{1}{c@{\hspace{1.5em}}}{{\tiny\faIcon{arrow-down}}\iconFederated}",
    r"\multicolumn{1}{c}{\iconRemote~Remote}",
    r"\multicolumn{1}{c}{{\tiny\faIcon{arrow-down}}\iconRemote}",
    r"\multicolumn{1}{c}{\iconLocal~Local}",
    r"\multicolumn{1}{c}{{\tiny\faIcon{arrow-down}}\iconLocal}",
]
print(r"  " + " & ".join(cols) + r" \\")
print(r"  \midrule")
for _, row in df_crawled.iterrows():
    favicon_path = save_favicon(row["instance"], quiet=True)
    cols = [
        r"\favicon{" + (row["instance"] if favicon_path is not None else "mastodon.social") + r"}",
        r"\instance{" + row["instance"] + r"}",
        f"{row['posts_crawled']:,d}",
        f"{row['percentage_crawled_of_all_crawled']:,.0%}".replace("%", r"\%"),
        f"{row['posts_crawled_federated']:,d}",
        f"{row['percentage_crawled_federated_of_all_crawled_federated']:,.0%}".replace("%", r"\%"),
        f"{row['posts_crawled_local']:,d}",
        f"{row['percentage_crawled_local_of_all_crawled_local']:,.0%}".replace("%", r"\%"),
    ]
    print(r"  " + " & ".join(cols) + r" \\")
print(r"  \midrule")
cols = [
    r"\iconCrawled",
    f"{crawled_instances_count:,d}" + r" crawled",
    f"{total_crawled_count:,d}",
    f"{1:,.0%}".replace("%", r"\%"),
    f"{total_crawled_federated_count:,d}",
    f"{1:,.0%}".replace("%", r"\%"),
    f"{total_crawled_local_count:,d}",
    f"{1:,.0%}".replace("%", r"\%"),
]
print(r"  " + " & ".join(cols) + r" \\")
print(r"  \bottomrule")
print(r"\end{tabular}")

\begin{tabular}{@{}cl@{\hspace{2em}}rr@{\hspace{1.5em}}rr@{\hspace{1.5em}}rr@{}}
  \toprule
  \multicolumn{2}{@{}l}{\textbf{Instance}} & \multicolumn{6}{c}{\textbf{\iconCrawled~Crawled posts}} \\
  \cmidrule{3-8}
   &  & \multicolumn{1}{c}{\iconFederated~Fed.} & \multicolumn{1}{c@{\hspace{1.5em}}}{{\tiny\faIcon{arrow-down}}\iconFederated} & \multicolumn{1}{c}{\iconRemote~Remote} & \multicolumn{1}{c}{{\tiny\faIcon{arrow-down}}\iconRemote} & \multicolumn{1}{c}{\iconLocal~Local} & \multicolumn{1}{c}{{\tiny\faIcon{arrow-down}}\iconLocal} \\
  \midrule
  \favicon{mastodon.social} & \instance{mastodon.social} & 15,339,031 & 43\% & 12,419,766 & 36\% & 3,213,603 & 33\% \\
  \favicon{mastodon.online} & \instance{mastodon.online} & 9,903,317 & 28\% & 9,637,941 & 28\% & 254,626 & 3\% \\
  \favicon{mstdn.social} & \instance{mstdn.social} & 9,544,636 & 27\% & 9,161,146 & 26\% & 361,335 & 4\% \\
  \favicon{ohai.social} & \instance{ohai.social} & 8,706,348 & 25\% & 8,691,028 & 25\% & 19,110 & 0\% \\
