In [1]:
from json import loads
from mastodon import Mastodon
from numpy import exp, log10
from pandas import read_csv, concat
from pathlib import Path
from scipy.stats import lognorm

from fediverse_analysis.instance_data.analyze import Analyzer

In [2]:
# Instances to take out of the existing sample (see the "non-crawlable" removal step further down)
INSTANCES_TO_REMOVE = [
    'blips2.club',
    'epsilon.social',
    'fedi.lat',
    'fedibird.com',
    'mental.social',
    'penguicon.social',
    'toot.rebel.ar',
]

# --- Input files ---
## Raw instance data handed to the Analyzer
INSTANCE_DATA_PATH = Path('/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/fedi_data/2023-12-20/07.jsonl')
## Instance list of the existing sample, one instance per line
SAMPLED_INSTANCES_PATH = Path('/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/sample/04/instances.txt')
## List of instances removed during sampling because the timeline was not crawlable
REMOVED_INSTANCES_PATH = Path('/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/sample/01/instances_removed_during_sampling.json')

# --- Output files (written relative to the current working directory) ---
## Base name and extension for the files describing only the resampled instances
RESAMPLE_OUTPUT_FILE_NAME = 'instances_resample'
RESAMPLE_OUTPUT_FILE_EXT = 'txt'
## File name (including extension) of the new, complete instance sample
NEW_SAMPLE_OUTPUT_FILE_NAME = 'instances.txt'

In [3]:
# Load data
## Instance data: the Analyzer is constructed from the open file handle.
with open(INSTANCE_DATA_PATH, 'r') as file:
    an = Analyzer(file)
## Instances removed during sampling: only the first line of the file is parsed as JSON.
with open(REMOVED_INSTANCES_PATH, 'r') as file:
    removed_instances = loads(file.readline())
## Existing sample, one instance per line. Blank lines (e.g. a trailing newline
## at the end of the file) are skipped, because an empty string is not a valid
## instance label and would fail the label lookup on the data frame later on.
with open(SAMPLED_INSTANCES_PATH, 'r') as file:
    sampled_instances = [line.strip() for line in file if line.strip()]

Number of fediverse instances in input file: 22178
Removed for (partially) no data: 11822
↳ Almost all of these instances run fediverse software other than Mastodon, some run Mastodon with a non-public API.
Removed duplicates: 2
Remaining: 10354


In [4]:
COLUMNS = ['total_users', 'monthly_users', 'total_statuses',
        'mean_weekly_statuses', 'mean_weekly_logins', 'mean_weekly_registrations']

# Prepare sample data
## Every activity column is modelled with a log-normal distribution.
cols_prob_measures = {
    col: lognorm
    for col in COLUMNS
}
## Work on a copy: the steps below add columns, sort and drop rows. Doing that
## on the Analyzer's own frame would make this cell fail or give different
## results when it is executed a second time.
## NOTE(review): assumes `an.df` hands out the same frame on every access - confirm.
df = an.df.copy()
## Estimate probability distributions over activity columns
distributions = {
    col: dist.fit(df[col])
    for col, dist in cols_prob_measures.items()
}
## Evaluate the log-density of every observed value under the distribution fitted for its column.
for col, dist in cols_prob_measures.items():
    shape, location, scale = distributions[col]
    df[f"{col}_log_probability"] = dist.logpdf(df[col], shape, location, scale)
## Compute joint probability (under assumption of independence; using log probabilities for numerical stability)
df["log_probability"] = 0.0  # float accumulator, the summands are floats
for col in cols_prob_measures:
    df["log_probability"] += df[f"{col}_log_probability"]

df = df.sort_values("log_probability")
## The distributions above were fitted on all instances; the instances removed
## during sampling are only excluded afterwards.
df = df.drop(removed_instances)
## Inverse of the joint density: the less probable an instance, the larger its weight.
df["weight"] = exp(-df["log_probability"])

  return np.sum((1 + np.log(shifted/scale)/shape**2)/shifted)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  lndata = np.log(data - loc)


In [5]:
# Rows of the existing sample
old_sample = df.loc[sampled_instances]
# Existing sample without the non-crawlable instances
new_sample = old_sample.drop(index=INSTANCES_TO_REMOVE, errors='ignore')
# Shrink the candidate pool in place: neither the non-crawlable instances
# nor the already sampled ones may be picked afterwards
for labels in (INSTANCES_TO_REMOVE, sampled_instances):
    df.drop(index=labels, inplace=True, errors='ignore')
# Sampled instances that are missing from the cleaned sample are the ones that actually got dropped
was_dropped = ~old_sample.index.isin(new_sample.index)
to_resample = old_sample[was_dropped]
to_resample

Unnamed: 0_level_0,total_users,monthly_users,total_statuses,mean_weekly_statuses,mean_weekly_logins,mean_weekly_registrations,total_users_log_probability,monthly_users_log_probability,total_statuses_log_probability,mean_weekly_statuses_log_probability,mean_weekly_logins_log_probability,mean_weekly_registrations_log_probability,log_probability,weight
instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
fedi.lat,4,4,2173,231.5,2.75,0.5,-3.058289,-7.761291,-9.451469,-12.47542,-7.42218,-8.868001,-49.03665,1.978548e+21


In [6]:
# Snapshot of the candidate pool; the replacement rows are looked up here after
# the loop below has removed them from `df`.
orig_df = df.copy()
new_instances = []

# Resample by choosing the instance with the weight closest (on a log scale) to the dropped instance.
# Because weight = exp(-log_probability), |log10(weight_a / weight_b)| is
# proportional to |log_probability_a - log_probability_b|. Comparing the log
# probabilities directly selects the same instance but avoids inf/NaN distances
# (and the accompanying RuntimeWarning) when a weight overflows or underflows.
for instance in to_resample.index:
    target_log_probability = to_resample.loc[instance, 'log_probability']
    new_instance = (df['log_probability'] - target_log_probability).abs().idxmin()
    new_instances.append(new_instance)
    # Remove the pick from the pool so it cannot replace a second dropped instance
    df.drop(new_instance, inplace=True)

resample = orig_df.loc[new_instances]
resample

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,total_users,monthly_users,total_statuses,mean_weekly_statuses,mean_weekly_logins,mean_weekly_registrations,total_users_log_probability,monthly_users_log_probability,total_statuses_log_probability,mean_weekly_statuses_log_probability,mean_weekly_logins_log_probability,mean_weekly_registrations_log_probability,log_probability,weight
instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
weatherby2378.social,2,3,6565,135.5,2.5,0.25,-2.493525,-7.473181,-10.450699,-11.938732,-7.326727,-8.168164,-47.851029,6.045576e+20


In [7]:
# Combine the kept part of the old sample with the replacements and write the
# complete instance list to file (one instance per line, no header)
new_instance_sample = concat((new_sample, resample)).sort_values('instance')
new_sample_path = Path(NEW_SAMPLE_OUTPUT_FILE_NAME)
new_instance_sample.reset_index()['instance'].to_csv(new_sample_path, index=False, header=False)

# All columns of the replacement instances. Maybe we want to have that data later.
resample_full_data_path = Path(f"{RESAMPLE_OUTPUT_FILE_NAME}_full_data.csv")
resample.to_csv(resample_full_data_path)

# Names of the replacement instances only (one instance per line, no header)
resampled_instances = resample.reset_index()['instance']
resample_list_path = Path(f"{RESAMPLE_OUTPUT_FILE_NAME}.{RESAMPLE_OUTPUT_FILE_EXT}")
resampled_instances.to_csv(resample_list_path, index=False, header=False)

In [8]:
# Test if instances are crawlable
# Every resampled instance is queried before the check fails, so that the
# assertion message names all instances with an empty public timeline instead
# of stopping silently at the first one. Errors raised by the API client
# itself are deliberately not caught.
not_crawlable = []
for instance in resampled_instances:
    m = Mastodon(api_base_url=instance)
    if len(m.timeline(timeline='public')) == 0:
        not_crawlable.append(instance)
assert not not_crawlable, f"Public timeline is empty for: {not_crawlable}"