In [2]:
import pandas as pd
import json
from sseclient import SSEClient as EventSource
from bloom_filter import BloomFilter

In [3]:
url = 'https://stream.wikimedia.org/v2/stream/recentchange'
wiki = 'enwiki' #Client side filter
counter = 0
changes = []
maxEvents =  10000
for event in EventSource(url):
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
        if change['wiki'] == wiki:
            changes.append(change)
            counter += 1
            if counter > maxEvents:
                break





In [4]:
train = pd.json_normalize(changes)
train.to_pickle('train.pkl')
train.shape

(10001, 57)

In [5]:
train = pd.read_pickle('train.pkl')

In [6]:
bots = train.loc[train['bot']==True, 'user'].unique().tolist()

In [7]:
bloom_filter = BloomFilter(n_elements=len(bots), fp_prob=0.1)

Initiazied filter of size 138


In [8]:
for bot in bots:
    bloom_filter.insert(bot)

In [9]:
url = 'https://stream.wikimedia.org/v2/stream/recentchange'
wiki = 'enwiki' #Client side filter
counter = 0
changes = []
maxEvents =  1000 # print n events and stop
for event in EventSource(url):
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
        if change['wiki'] == wiki:
            counter += 1

            bot_bloom = bloom_filter.check(change['user'])
            change['bot_bloom'] = bot_bloom
            changes.append(change)

            if counter > maxEvents:
                break

In [10]:
test = pd.json_normalize(changes)
test.to_pickle('test.pkl')

In [11]:
bot_count = test[(test['bot']==True)].shape[0]
bloom_bot_count = test[(test['bot_bloom']==True)].shape[0]
tp_count = test[(test['bot']==True) & (test['bot_bloom']==True)].shape[0]
print(f'True bots: {bot_count}\nBloom filtered bots: {bloom_bot_count}\nTrue positives: {tp_count}')

True bots: 184
Bloom filtered bots: 251
True positives: 172


In [12]:
accuracy = test[test['bot'] == test['bot_bloom']].shape[0] / test.shape[0]
fp_rate = test[(test['bot']==False) & (test['bot_bloom']==True)].shape[0] / test.shape[0]
fn_rate = test[(test['bot']==True) & (test['bot_bloom']==False)].shape[0] / test.shape[0] # should be 0
fp_rate, fn_rate
print(f'Accuracy: {accuracy}\nFP rate: {fp_rate}\nFN rate: {fn_rate}')

Accuracy: 0.9090909090909091
FP rate: 0.07892107892107893
FN rate: 0.011988011988011988
