In [1]:
from json import loads
from mastodon import Mastodon
from numpy import exp, log10
from pandas import read_csv, concat
from pathlib import Path
from scipy.stats import lognorm

from fediverse_analysis.instance_data.analyze import Analyzer

In [2]:
# To Remove
INSTANCES = ['fedi.halcyon-is.land', 'fedibird.com', 'meerjungfrauengrotte.de', 'mental.social', 'penguicon.social', 'toot.rebel.ar']
# Input
INSTANCE_DATA_PATH = Path('/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/instance-data/mastodon.jsonl')
SAMPLED_INSTANCES_PATH = Path('/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/sample/instances.txt')
REMOVED_INSTANCES_PATH = Path('/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-ernst/sample/instances_removed_from_sample.json')
# Output
RESAMPLE_OUTPUT_FILE_NAME = 'instances_resampled'
RESAMPLE_OUTPUT_FILE_EXT = 'txt'
NEW_FULL_SAMPLE_OUTPUT_FILE_NAME = 'instances.txt'

In [3]:
# Load data
with open(INSTANCE_DATA_PATH, 'r') as file:
    an = Analyzer(file)
with open(REMOVED_INSTANCES_PATH, 'r') as file:
    removed_instances = loads(file.readline())
with open(SAMPLED_INSTANCES_PATH, 'r') as file:
    sampled_instances = [instance.strip() for instance in file.readlines()]

Number of instances in input file: 22178
Removed for (partially) no data: 11822
Removed duplicates: 2
Remaining: 10354


In [4]:
COLUMNS = ['total_users', 'monthly_users', 'total_statuses',
        'mean_weekly_statuses', 'mean_weekly_logins', 'mean_weekly_registrations']

cols_prob_measures = {
    col: lognorm
    for col in COLUMNS
}
df = an.df
# Estimate probability distributions over activity columns
distributions = {
    col: dist.fit(df[col])
    for col, dist in cols_prob_measures.items()
}
# Compute normalize activity score by dividing by the estimated probability.
for col, dist in cols_prob_measures.items():
    shape, location, scale = distributions[col]
    df[f"{col}_log_probability"] = dist.logpdf(df[col], shape, location, scale)
# Compute joint probability (under assumption of independence; using log probabilities for numerical stability)
df["log_probability"] = 0
for col in cols_prob_measures.keys():
    df["log_probability"] += df[f"{col}_log_probability"]

df.sort_values("log_probability", inplace=True)
df.drop(removed_instances, inplace=True)
df["weight"] = exp(-df["log_probability"])

  return np.sum((1 + np.log(shifted/scale)/shape**2)/shifted)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  lndata = np.log(data - loc)


In [5]:
sample = df.loc[sampled_instances]
to_resample = df.loc[INSTANCES]

# Remove already sampled instances
df.drop(sampled_instances, inplace=True)
orig_df = df.copy()

In [6]:
new_instances = []

# Resample
for instance in to_resample.index:
    new_instance = log10(df['weight'].div(to_resample.loc[instance]['weight'])).abs().sort_values().idxmin()
    new_instances.append(new_instance)
    df.drop(new_instance, inplace=True)

resample = orig_df.loc[new_instances]

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [7]:
new_instance_sample = concat((sample.drop(INSTANCES), resample))
new_instance_sample.reset_index()['instance'].to_csv(Path(NEW_FULL_SAMPLE_OUTPUT_FILE_NAME), index=False, header=False)
# Full DataFrame. Maybe we want to have that data later.
resample.to_csv(Path(RESAMPLE_OUTPUT_FILE_NAME + '_full_data.csv'))
# Raw instance list only.
resampled_instances = resample.reset_index()['instance']
resampled_instances.to_csv(Path(RESAMPLE_OUTPUT_FILE_NAME + '.' + RESAMPLE_OUTPUT_FILE_EXT), index=False, header=False)
resampled_instances

0      mastodon.nycmesh.net
1              nicaloro.com
2              blogi.social
3    mastodon.cesko.digital
4              exito.social
5              terra.social
Name: instance, dtype: object

In [8]:
# Test if instances are crawlable
for instance in resampled_instances:
    m = Mastodon(api_base_url=instance)
    assert len(m.timeline(timeline='public')) > 0