In [1]:
from json import dumps, loads
from mastodon import Mastodon
from numpy import exp, log10
from pandas import read_csv, concat
from pathlib import Path
from scipy.stats import lognorm

from mastodon_search.instance_data.analyze import Analyzer

In [2]:
# --- Instances to take out of the current sample ---
# These labels are dropped from the old sample and from the candidate pool,
# and are added to the list of uncrawlable instances written at the end.
INSTANCES_TO_REMOVE = [
    'anime.kona.moe',
    'bear.community',
    'blimps.xyz',
    'www.blimps.xyz',
    'furrypaws.cc',
    'jidaar.net',
    'kmy.blue',
    'mstdn.lt',
    'shao.life',
    'torontodiy.xyz',
    'www.mastodon.scot',
]

# --- Input ---
# Raw instance data that is handed to the Analyzer
INSTANCE_DATA_PATH = Path('../data/instance_data.jsonl')
# Previously sampled instances, one per line
SAMPLED_INSTANCES_PATH = Path('../data/instances.txt')
# Instances removed during sampling because the timeline was not crawlable
REMOVED_INSTANCES_PATH = Path('../data/uncrawlable_instances.json')
# Instances that are subtracted again from the loaded list of removed instances
READDED_INSTANCES = {
    'barkoczy.social',
    'crashtodon.net',
    'fedibird.com',
    'liker.social',
    'metalhead.club',
    'social.1up.ninja',
    'verkehrswende.social',
}

# --- Output ---
# File name stem for the newly sampled instances ('.txt' / '_full_data.csv' get appended)
RESAMPLE_OUTPUT_FILE_NAME = 'instances_resample'
# File for the previously sampled instances plus the new ones
NEW_SAMPLE_OUTPUT_FILE_NAME = 'instances.txt'
# File for all currently uncrawlable instances
UNCRAWLABLE_OUTPUT_FILE_NAME = 'uncrawlable_instances.json'

In [3]:
# Load data
# The Analyzer receives the open handle of the instance data file.
# The encoding is pinned so that the result does not depend on the platform locale.
with open(INSTANCE_DATA_PATH, 'r', encoding='utf-8') as file:
    an = Analyzer(file)
# One instance name per line. Blank lines are skipped: an empty label would
# otherwise make the later label-based lookup of the sample fail with a KeyError.
with open(SAMPLED_INSTANCES_PATH, 'r', encoding='utf-8') as file:
    sampled_instances = [line.strip() for line in file if line.strip()]
# Parse the whole file instead of only its first line, so that a
# pretty-printed (multi-line) JSON list is read correctly as well.
with open(REMOVED_INSTANCES_PATH, 'r', encoding='utf-8') as file:
    removed_instances = set(loads(file.read()))
# Re-added instances no longer count as removed.
removed_instances -= READDED_INSTANCES

Number of fediverse instances in input file: 22178
Removed for (partially) no data: 11822
↳ Almost all of these instances run fediverse software other than Mastodon, some run Mastodon with a non-public API.
Removed duplicates: 2
Remaining: 10354


In [4]:
COLUMNS = ['total_users', 'monthly_users', 'total_statuses',
        'mean_weekly_statuses', 'mean_weekly_logins', 'mean_weekly_registrations']

# Prepare sample data
## Every activity column is modelled with the same distribution family (log-normal).
cols_prob_measures = {
    col: lognorm
    for col in COLUMNS
}
## Work on a copy: the statements below add columns, reorder and drop rows.
## Doing that on the Analyzer's own frame would make a second run of this cell
## fail, because the removed instances would already be gone.
df = an.df.copy()
## Estimate probability distributions over activity columns
distributions = {
    col: dist.fit(df[col])
    for col, dist in cols_prob_measures.items()
}
## Evaluate the log-density of every instance under the fitted distribution of each column.
for col, dist in cols_prob_measures.items():
    shape, location, scale = distributions[col]
    df[f'{col}_log_probability'] = dist.logpdf(df[col], shape, location, scale)
## Compute joint probability (under assumption of independence; using log probabilities for numerical stability)
## Start from a float zero so that the accumulator column is float from the beginning.
df['log_probability'] = 0.0
for col in cols_prob_measures.keys():
    df['log_probability'] += df[f'{col}_log_probability']

df = df.sort_values('log_probability')
## Instances known to be uncrawlable can never be sampled.
df = df.drop(list(removed_instances))
## Weight = inverse of the joint density, i.e. less probable instances get larger weights.
df['weight'] = exp(-df['log_probability'])

  return np.sum((1 + np.log(shifted/scale)/shape**2)/shifted)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  lndata = np.log(data - loc)


In [5]:
# Look up the previously sampled instances in the weighted frame
old_sample = df.loc[sampled_instances]
# Mark the sampled instances that have to be given up because they are not crawlable
is_removed = old_sample.index.isin(INSTANCES_TO_REMOVE)
# Keep the part of the old sample that is still usable ...
new_sample = old_sample[~is_removed]
# ... and shrink the candidate pool: neither non-crawlable instances nor
# already sampled instances may be picked as replacements
for labels in (INSTANCES_TO_REMOVE, sampled_instances):
    df.drop(labels, inplace=True, errors='ignore')
# Instances that were actually dropped from the sample and need a replacement
to_resample = old_sample[is_removed]
to_resample

Unnamed: 0_level_0,total_users,monthly_users,total_statuses,mean_weekly_statuses,mean_weekly_logins,mean_weekly_registrations,total_users_log_probability,monthly_users_log_probability,total_statuses_log_probability,mean_weekly_statuses_log_probability,mean_weekly_logins_log_probability,mean_weekly_registrations_log_probability,log_probability,weight
instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
anime.kona.moe,638,55,1714,10.0,31.25,3.0,-10.597354,-10.386307,-9.31103,-9.327125,-9.856304,-10.677085,-60.155206,1.333748e+26
bear.community,3952,139,67788,766.5,89.5,0.75,-14.771642,-11.314889,-14.67867,-13.675097,-10.910163,-9.277382,-74.627843,2.573121e+32
blimps.xyz,555,228,182341,1092.25,175.75,0.25,-10.310173,-11.810536,-17.324384,-14.029976,-11.586049,-8.168164,-73.229283,6.354384e+31
jidaar.net,6,6,424,79.25,4.75,1.0,-3.440605,-8.167362,-8.851522,-11.401289,-7.969548,-9.567844,-49.39817,2.840225e+21
kmy.blue,2193,913,257392,5808.25,628.25,7.0,-13.338922,-13.200129,-18.36077,-15.70442,-12.861954,-11.532592,-84.998787,8.213042000000001e+36
mstdn.lt,25,9,932,37.75,5.5,0.25,-5.091011,-8.573437,-9.045384,-10.65818,-8.116373,-8.168164,-49.652548,3.662922e+21
shao.life,36,32,9313,271.5,21.0,0.25,-5.588801,-9.843877,-10.898008,-12.635124,-9.458194,-8.168164,-56.592169,3.781521e+24
torontodiy.xyz,63,32,3058,51.25,13.75,3.75,-6.412975,-9.843877,-9.696555,-10.964523,-9.034059,-10.902389,-56.854377,4.915213e+24
www.blimps.xyz,555,228,182417,1092.25,175.75,0.25,-10.310173,-11.810536,-17.325601,-14.029976,-11.586049,-8.168164,-73.2305,6.3621240000000005e+31
www.mastodon.scot,28712,3730,1662254,24825.0,2268.25,11.0,-20.18936,-14.60983,-24.977684,-17.15997,-14.147856,-11.988961,-103.073661,5.811952e+44


In [6]:
orig_df = df.copy()
new_instances = []

# Candidate weights. Only this private Series shrinks while replacements are
# picked; `df` itself is left untouched, so re-running the cell yields the
# same replacements instead of silently choosing the next-best candidates.
candidate_weights = orig_df['weight'].copy()

# Resample by choosing the instance with the weight closest to the dropped instance
for target_weight in to_resample['weight']:
    # Distance on a log scale: absolute order-of-magnitude difference of the weights.
    # The expression (including the sort) is unchanged, so the same candidate is selected.
    log_distance = log10(candidate_weights.div(target_weight)).abs()
    new_instance = log_distance.sort_values().idxmin()
    new_instances.append(new_instance)
    # A chosen instance must not be picked a second time.
    candidate_weights = candidate_weights.drop(new_instance)

resample = orig_df.loc[new_instances]
resample

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,total_users,monthly_users,total_statuses,mean_weekly_statuses,mean_weekly_logins,mean_weekly_registrations,total_users_log_probability,monthly_users_log_probability,total_statuses_log_probability,mean_weekly_statuses_log_probability,mean_weekly_logins_log_probability,mean_weekly_registrations_log_probability,log_probability,weight
instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
verkehrswende.social,141,72,13511,215.25,58.0,0.25,-7.727483,-10.656057,-11.446578,-12.402494,-10.475689,-8.168164,-60.876465,2.74355e+26
liker.social,3353,324,449013,1371.5,222.5,4.0,-14.363674,-12.162488,-20.157095,-14.258104,-11.822287,-10.967553,-83.7312,2.3120550000000002e+36
metalhead.club,5415,876,420355,5544.0,660.25,5.0,-15.570939,-13.158693,-19.936263,-15.657762,-12.911714,-11.192858,-88.428229,2.5344970000000003e+38
crashtodon.net,20,29,471,6.5,15.25,0.75,-4.80166,-9.745286,-8.870231,-8.895485,-9.137759,-9.277382,-50.727802,1.073506e+22
social.1up.ninja,6,3,149667,2540.0,1.75,0.25,-3.440605,-7.473181,-16.757234,-14.875617,-6.969519,-8.168164,-57.684319,1.12715e+25
communist.accountant,9,8,254,28.75,5.5,0.25,-3.861248,-8.455476,-8.782532,-10.385287,-8.116373,-8.168164,-47.76908,5.569907e+20
barkoczy.social,558,32,201,2.5,8.75,6.5,-10.321197,-9.843877,-8.760609,-7.938077,-8.581384,-11.457765,-56.902908,5.159637e+24
mastodon.cl,68,13,1987,25.0,9.5,0.25,-6.531107,-8.941717,-9.395784,-10.245245,-8.663747,-8.168164,-51.945766,3.628769e+22
typo3.social,12,9,143,8.0,6.5,0.5,-4.182936,-8.573437,-8.736401,-9.103537,-8.28368,-8.868001,-47.747992,5.453679e+20
fedibird.com,38475,12756,13716516,132120.25,10238.25,102.0,-21.066495,-15.841443,-34.49602,-18.835279,-15.657456,-14.237683,-120.134376,1.491753e+52


In [7]:
# Complete sample: the kept instances plus their replacements, ordered by instance name
new_instance_sample = concat((new_sample, resample)).sort_values('instance')
complete_sample_names = new_instance_sample.reset_index()['instance']
complete_sample_names.to_csv(Path(NEW_SAMPLE_OUTPUT_FILE_NAME), index=False, header=False)

# Replacement instances with all of their columns, in case the data is needed later
full_data_path = Path(f'{RESAMPLE_OUTPUT_FILE_NAME}_full_data.csv')
resample.to_csv(full_data_path)

# Replacement instances as a bare list of names
resampled_instances = resample.reset_index()['instance']
name_list_path = Path(f'{RESAMPLE_OUTPUT_FILE_NAME}.txt')
resampled_instances.to_csv(name_list_path, index=False, header=False)

# All uncrawlable instances: the ones known before plus the ones removed in this run
with open(UNCRAWLABLE_OUTPUT_FILE_NAME, mode='w+') as f:
    uncrawlable_json = dumps(sorted(removed_instances.union(INSTANCES_TO_REMOVE)))
    f.write(uncrawlable_json)

In [8]:
# Test if instances are crawlable
# Every resampled instance is probed before the verdict is given, so that one
# run reports all problematic instances instead of stopping at the first one.
uncrawlable_resampled = {}
for instance in resampled_instances:
    try:
        m = Mastodon(api_base_url=instance)
        if len(m.timeline(timeline='public')) == 0:
            uncrawlable_resampled[instance] = 'public timeline is empty'
    except Exception as error:
        # Any failure while contacting the instance means it cannot be crawled;
        # it is recorded (not swallowed) and surfaces in the assertion below.
        uncrawlable_resampled[instance] = repr(error)
assert not uncrawlable_resampled, f'Resampled instances that are not crawlable: {uncrawlable_resampled}'