In [1]:
from elasticsearch_dsl import connections, Index, Search
from elasticsearch_dsl.query import Range, Terms, Term
from elasticsearch_dsl.aggs import Terms as PerTerms, Cardinality
from pandas import DataFrame, NA
from pathlib import Path

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  from pandas import DataFrame, NA


In [34]:
# Connection settings for the Elasticsearch cluster; host and port are joined into one URL when connecting.
ELASTIC_HOST = "https://elasticsearch.srv.webis.de"
ELASTIC_PORT = 9200
ELASTIC_USER = "ajjxp"
# To keep the password out of the notebook, point this at a file that contains only the Elastic password.
ELASTIC_PASSWORD_FILE = Path("~/.local/share/passwords/webis-elasticsearch.txt").expanduser()
# Index pattern to search; the trailing `*` is a wildcard.
INDEX = "corpus_mastodon_statuses*"

# NOTE(review): not referenced by the cells visible in this notebook excerpt.
OUTPUT_PATH = Path("../data")

# NOTE(review): NUM_EXPLICIT_INSTANCES is not referenced by the cells visible in this notebook excerpt.
NUM_EXPLICIT_INSTANCES = 10
# Number of buckets requested from each terms aggregation.
NUM_TOP_VALUES_BUCKETS = 50
# Number of rows printed in each LaTeX table.
NUM_TOP_VALUES = 10

# Limit the Elastic searches to a specific date range. Crawling started on 2023-12-21.
DATE_AFTER = "2023-12-01T00:00:00"
# Approx. "2024-01-30T12:00:00" is the time when a new version of the fediverse data was gathered.
DATE_BEFORE = "2024-02-22T00:00:00"

In [3]:
# Connect to Elastic.
# The password is taken from the first line of ELASTIC_PASSWORD_FILE so that it
# never appears in the notebook itself.
with ELASTIC_PASSWORD_FILE.open("r") as f:
    password = f.readline().strip("\n")
elastic = connections.create_connection(
    hosts=ELASTIC_HOST + ":" + str(ELASTIC_PORT),
    basic_auth=(ELASTIC_USER, password),
    # The 8.x Python client deprecates `timeout` in favour of `request_timeout`
    # (seconds). The aggregations below scan hundreds of millions of documents,
    # hence the generous value.
    request_timeout=300,
)

In [4]:
# Range filter on `crawled_at`, inclusive on both ends, shared by every search below.
date_query = Range(
    crawled_at=dict(
        gte=DATE_AFTER,
        lte=DATE_BEFORE,
        format="date_hour_minute_second",
    )
)
date_query.to_dict()

{'range': {'crawled_at': {'gte': '2023-12-01T00:00:00',
   'lte': '2024-02-22T00:00:00',
   'format': 'date_hour_minute_second'}}}

In [5]:
# Base search over all matching indices, restricted to the configured crawl-date range.
# Every later query is derived from this object.
base_search: Search = (
    Index(INDEX)
    .search()
    .filter(date_query)
)
base_search.count()

733422366

In [14]:
# Number of documents matching the date filter; used below as the denominator of `proportion_posts`.
total_count: int = base_search.count()
total_count

733422366

In [15]:
# Count the distinct post URIs within the date range. `size=0` is passed because
# only the aggregation result is read, not the hits.
# NOTE: Elasticsearch's cardinality aggregation is approximate, so this value
# (and the per-bucket `uris` values further down) is an estimate, not an exact count.
source_uris_count_search: Search = base_search.params(size=0)
source_uris_count_search.aggs.metric("uris", Cardinality(field="uri.keyword"))
source_uris_count = source_uris_count_search.execute().aggs.uris.value
source_uris_count

35300568

In [21]:
# Aggregation name -> keyword field whose most frequent values are requested.
# The names are read back by the cells below (`top_values_aggs.<name>.buckets`),
# so they must not be changed (including the singular `top_application`).
top_values_fields = {
    "top_tags": "tags.name.keyword",
    "top_authors": "account.handle.keyword",
    "top_languages": "language.keyword",
    "top_application": "application.name.keyword",
}

# One request computes all four breakdowns; `size=0` because only aggregations are read.
top_values_search: Search = base_search.params(size=0)
for agg_name, field in top_values_fields.items():
    # Terms bucket per distinct field value, each carrying a nested (approximate)
    # count of distinct post URIs under the key `uris`.
    # NOTE(review): the terms aggregation picks its NUM_TOP_VALUES_BUCKETS buckets by
    # document count by default, while the cells below re-rank by `uris`; a value with
    # many unique posts but few documents could be missing — confirm this is intended.
    (
        top_values_search.aggs
        .bucket(agg_name, PerTerms(field=field, size=NUM_TOP_VALUES_BUCKETS))
        .metric("uris", Cardinality(field="uri.keyword"))
    )

top_values_aggs = top_values_search.execute().aggs
top_values_aggs

{'top_languages': {'doc_count_error_upper_bound': 4714, 'sum...}

In [30]:
# One row per tag bucket: document count, distinct-URI count, and both as a
# share of the corpus-wide totals; ordered by distinct-URI count, largest first.
top_tags = (
    DataFrame([
        {
            "tag": bucket.key,
            "posts": bucket.doc_count,
            "unique_posts": bucket.uris.value,
        }
        for bucket in top_values_aggs.top_tags.buckets
    ])
    .assign(
        proportion_posts=lambda frame: frame["posts"] / total_count,
        proportion_unique_posts=lambda frame: frame["unique_posts"] / source_uris_count,
    )
    .sort_values("unique_posts", ascending=False)
)
top_tags.head(n=10)

Unnamed: 0,tag,posts,unique_posts,proportion_posts,proportion_unique_posts
1,news,14387099,254354,0.019616,0.007205
0,press,24635394,235316,0.03359,0.006666
40,News,1328968,202197,0.001812,0.005728
14,nowplaying,2573445,146348,0.003509,0.004146
3,nsfw,5792395,96020,0.007898,0.00272
15,bot,2524265,76058,0.003442,0.002155
2,ukraine,8075986,61792,0.011011,0.00175
9,photography,4083281,55239,0.005567,0.001565
12,music,3160517,54225,0.004309,0.001536
13,art,2906067,41924,0.003962,0.001188


In [46]:
# Print the NUM_TOP_VALUES tags with the most unique posts as a LaTeX tabular
# (booktabs-style rules): tag, unique-post count, and share of all unique posts.
print(r"\begin{tabular}{@{}lrr@{}}")
print(r"  \toprule")
cols = [
    r"\textbf{Tag}",
    # The count and percentage columns share a single header.
    r"\multicolumn{2}{c@{}}{\textbf{Uniq.\ posts}}",
]
print(r"  " + " & ".join(cols) + r" \\")
print(r"  \midrule")
for _, row in top_tags.head(n=NUM_TOP_VALUES).iterrows():
    cols = [
        # Escape underscores (as the author table does): a bare `_` in a hashtag
        # would otherwise break LaTeX compilation outside math mode.
        r"\#" + row["tag"].replace("_", r"\_"),
        f"{row['unique_posts']:,d}",
        # `%` starts a comment in LaTeX, so it has to be escaped.
        f"{row['proportion_unique_posts']:,.1%}".replace("%", r"\%"),
    ]
    print(r"  " + " & ".join(cols) + r" \\")
print(r"  \bottomrule")
print(r"\end{tabular}")

\begin{tabular}{@{}lrr@{}}
  \toprule
  \textbf{Tag} & \multicolumn{2}{c@{}}{\textbf{Uniq.\ posts}} \\
  \midrule
  \#news & 254,354 & 0.7\% \\
  \#press & 235,316 & 0.7\% \\
  \#News & 202,197 & 0.6\% \\
  \#nowplaying & 146,348 & 0.4\% \\
  \#nsfw & 96,020 & 0.3\% \\
  \#bot & 76,058 & 0.2\% \\
  \#ukraine & 61,792 & 0.2\% \\
  \#photography & 55,239 & 0.2\% \\
  \#music & 54,225 & 0.2\% \\
  \#art & 41,924 & 0.1\% \\
  \bottomrule
\end{tabular}


In [27]:
# One row per author bucket: document count, distinct-URI count, and both as a
# share of the corpus-wide totals; ordered by distinct-URI count, largest first.
top_authors = (
    DataFrame([
        {
            "author": bucket.key,
            "posts": bucket.doc_count,
            "unique_posts": bucket.uris.value,
        }
        for bucket in top_values_aggs.top_authors.buckets
    ])
    .assign(
        proportion_posts=lambda frame: frame["posts"] / total_count,
        proportion_unique_posts=lambda frame: frame["unique_posts"] / source_uris_count,
    )
    .sort_values("unique_posts", ascending=False)
)
top_authors.head(n=10)

Unnamed: 0,author,posts,unique_posts,proportion_posts,proportion_unique_posts
20,my24group@mastodon.social,1128104,97831,0.001538,0.002771
2,europesays@pubeurope.com,3433518,43603,0.004682,0.001235
23,g1_globo@mastodon.social,1076868,39055,0.001468,0.001106
11,rawchili@channels.im,1275795,37466,0.00174,0.001061
24,prtimes@rss-mstdn.studiofreesia.com,1070209,37018,0.001459,0.001049
18,htTweets@press.coop,1156017,35133,0.001576,0.000995
4,rogue_corq@mas.corq.co,2593847,31782,0.003537,0.0009
13,usluck@channels.im,1259813,28867,0.001718,0.000818
1,realTuckFrumper@mastodon.social,4783294,27057,0.006522,0.000766
9,dnc@vive.im,1359595,24367,0.001854,0.00069


In [50]:
# Print the NUM_TOP_VALUES accounts with the most unique posts as a LaTeX tabular
# (booktabs-style rules): account name and unique-post count.
print(r"\begin{tabular}{@{}lr@{}}")
print(r"  \toprule")
cols = [
    r"\textbf{Account}",
    r"\textbf{Uniq.}",
]
print(r"  " + " & ".join(cols) + r" \\")
print(r"  \midrule")
for _, row in top_authors.head(n=NUM_TOP_VALUES).iterrows():
    cols = [
        # Keep only the part of the handle before the first "@" and escape underscores
        # for LaTeX. The replacement must be a raw string: "\_" in a normal string
        # literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
        row["author"].split("@")[0].replace("_", r"\_"),
        f"{row['unique_posts']:,d}",
    ]
    print(r"  " + " & ".join(cols) + r" \\")
print(r"  \bottomrule")
print(r"\end{tabular}")

\begin{tabular}{@{}lr@{}}
  \toprule
  \textbf{Account} & \textbf{Uniq.} \\
  \midrule
  my24group & 97,831 \\
  europesays & 43,603 \\
  g1\_globo & 39,055 \\
  rawchili & 37,466 \\
  prtimes & 37,018 \\
  htTweets & 35,133 \\
  rogue\_corq & 31,782 \\
  usluck & 28,867 \\
  realTuckFrumper & 27,057 \\
  dnc & 24,367 \\
  \bottomrule
\end{tabular}


In [28]:
# One row per language bucket: document count, distinct-URI count, and both as a
# share of the corpus-wide totals; ordered by distinct-URI count, largest first.
top_languages = (
    DataFrame([
        {
            "language": bucket.key,
            "posts": bucket.doc_count,
            "unique_posts": bucket.uris.value,
        }
        for bucket in top_values_aggs.top_languages.buckets
    ])
    .assign(
        proportion_posts=lambda frame: frame["posts"] / total_count,
        proportion_unique_posts=lambda frame: frame["unique_posts"] / source_uris_count,
    )
    .sort_values("unique_posts", ascending=False)
)
top_languages.head(n=10)

Unnamed: 0,language,posts,unique_posts,proportion_posts,proportion_unique_posts
0,en,460162174,12315678,0.627418,0.34888
1,ja,56820942,7802875,0.077474,0.221041
2,de,56263129,1731484,0.076713,0.04905
5,zh,7793885,596070,0.010627,0.016886
3,fr,19578137,582844,0.026694,0.016511
4,es,12446913,533447,0.016971,0.015112
6,nl,6874538,265121,0.009373,0.00751
8,zh-CN,3909745,237492,0.005331,0.006728
7,pt,4505417,185635,0.006143,0.005259
9,it,3168160,161729,0.00432,0.004581


In [48]:
# Print the NUM_TOP_VALUES languages with the most unique posts as a LaTeX tabular
# (booktabs-style rules): language code, unique-post count, and share rounded to whole percent.
header_cells = [
    r"\textbf{Lang.}",
    r"\multicolumn{2}{c@{}}{\textbf{Unique posts}}",
]
table_lines = [
    r"\begin{tabular}{@{}lrr@{}}",
    r"  \toprule",
    r"  " + " & ".join(header_cells) + r" \\",
    r"  \midrule",
]
for entry in top_languages.head(n=NUM_TOP_VALUES).itertuples(index=False):
    # Codes longer than two characters (e.g. "zh-CN") get a negative kern appended
    # so that they do not widen the first column.
    kern = r"\kern-1.5em" if len(entry.language) > 2 else ""
    row_cells = [
        entry.language + kern,
        f"{entry.unique_posts:,d}",
        # `%` starts a comment in LaTeX, so it has to be escaped.
        f"{entry.proportion_unique_posts:,.0%}".replace("%", r"\%"),
    ]
    table_lines.append(r"  " + " & ".join(row_cells) + r" \\")
table_lines.append(r"  \bottomrule")
table_lines.append(r"\end{tabular}")
print("\n".join(table_lines))

\begin{tabular}{@{}lrr@{}}
  \toprule
  \textbf{Lang.} & \multicolumn{2}{c@{}}{\textbf{Unique posts}} \\
  \midrule
  en & 12,315,678 & 35\% \\
  ja & 7,802,875 & 22\% \\
  de & 1,731,484 & 5\% \\
  zh & 596,070 & 2\% \\
  fr & 582,844 & 2\% \\
  es & 533,447 & 2\% \\
  nl & 265,121 & 1\% \\
  zh-CN\kern-1.5em & 237,492 & 1\% \\
  pt & 185,635 & 1\% \\
  it & 161,729 & 0\% \\
  \bottomrule
\end{tabular}


In [29]:
# One row per application bucket: document count, distinct-URI count, and both as a
# share of the corpus-wide totals; ordered by distinct-URI count, largest first.
# (The aggregation itself is named `top_application`, singular.)
top_applications = (
    DataFrame([
        {
            "application": bucket.key,
            "posts": bucket.doc_count,
            "unique_posts": bucket.uris.value,
        }
        for bucket in top_values_aggs.top_application.buckets
    ])
    .assign(
        proportion_posts=lambda frame: frame["posts"] / total_count,
        proportion_unique_posts=lambda frame: frame["unique_posts"] / source_uris_count,
    )
    .sort_values("unique_posts", ascending=False)
)
top_applications.head(n=10)

Unnamed: 0,application,posts,unique_posts,proportion_posts,proportion_unique_posts
0,Web,2315587,2317376,0.003157,0.065647
1,Mastodon for Android,453964,454959,0.000619,0.012888
2,Tusky,387842,381749,0.000529,0.010814
3,Mastodon for iOS,371320,373691,0.000506,0.010586
4,dlvr.it,368986,365610,0.000503,0.010357
5,iembot,306201,310318,0.000417,0.008791
6,Jetpack,281636,276705,0.000384,0.007839
7,RSS投稿bot,202724,202894,0.000276,0.005748
8,Ivory for iOS,169742,167653,0.000231,0.004749
9,CheapBotsTootSweet,160544,160016,0.000219,0.004533


In [49]:
# Print the NUM_TOP_VALUES applications with the most unique posts as a LaTeX tabular
# (booktabs-style rules): application name, unique-post count, and share of all unique posts.
# NOTE(review): application names are emitted without LaTeX escaping; a name containing
# `&`, `_`, `#` or `%` would need handling.
header_cells = [
    r"\textbf{Application}",
    r"\multicolumn{2}{c@{}}{\textbf{Unique posts}}",
]
table_lines = [
    r"\begin{tabular}{@{}lrr@{}}",
    r"  \toprule",
    r"  " + " & ".join(header_cells) + r" \\",
    r"  \midrule",
]
# Platform suffixes are replaced by icon macros
# (presumably defined in the target document's preamble — TODO confirm).
platform_icons = (
    (" for Android", r"~\iconAndroid"),
    (" for iOS", r"~\iconiOS"),
)
for entry in top_applications.head(n=NUM_TOP_VALUES).itertuples(index=False):
    app = entry.application.replace("投稿", " ")
    for suffix, icon in platform_icons:
        if app.endswith(suffix):
            app = app[:-len(suffix)] + icon
            break
    else:
        # No platform suffix: long names get a negative kern appended so that
        # they do not widen the first column.
        if len(app) > 15:
            app += r"\kern-0.5em"
    row_cells = [
        app,
        f"{entry.unique_posts:,d}",
        # `%` starts a comment in LaTeX, so it has to be escaped.
        f"{entry.proportion_unique_posts:,.1%}".replace("%", r"\%"),
    ]
    table_lines.append(r"  " + " & ".join(row_cells) + r" \\")
table_lines.append(r"  \bottomrule")
table_lines.append(r"\end{tabular}")
print("\n".join(table_lines))

\begin{tabular}{@{}lrr@{}}
  \toprule
  \textbf{Application} & \multicolumn{2}{c@{}}{\textbf{Unique posts}} \\
  \midrule
  Web & 2,317,376 & 6.6\% \\
  Mastodon~\iconAndroid & 454,959 & 1.3\% \\
  Tusky & 381,749 & 1.1\% \\
  Mastodon~\iconiOS & 373,691 & 1.1\% \\
  dlvr.it & 365,610 & 1.0\% \\
  iembot & 310,318 & 0.9\% \\
  Jetpack & 276,705 & 0.8\% \\
  RSS bot & 202,894 & 0.6\% \\
  Ivory~\iconiOS & 167,653 & 0.5\% \\
  CheapBotsTootSweet\kern-0.5em & 160,016 & 0.5\% \\
  \bottomrule
\end{tabular}
